Alice Vale r20181074
Eva Ferrer r20181110
Rafael Sequeira r20181128
Raquel Sousa r20181102
Rogério Paulo m20210597
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from datetime import date, timedelta, time, datetime
import matplotlib.pyplot as plt
import seaborn as sns
import math
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import BaseDecisionTree
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
# Importing data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')
#setting Date as a datetime object
train['Date'] = pd.to_datetime(train['Date'])
train.info()
train.set_index('Access_ID',inplace=True)
non_metric_features = train.select_dtypes(exclude=np.number).set_index(train.index).columns
metric_features = train.select_dtypes(include=np.number).set_index(train.index).columns
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9999 entries, 0 to 9998 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Access_ID 9999 non-null int64 1 Date 9999 non-null datetime64[ns] 2 AccountMng_Pages 9999 non-null int64 3 AccountMng_Duration 9999 non-null float64 4 FAQ_Pages 9999 non-null int64 5 FAQ_Duration 9999 non-null float64 6 Product_Pages 9999 non-null int64 7 Product_Duration 9999 non-null float64 8 GoogleAnalytics_BounceRate 9999 non-null float64 9 GoogleAnalytics_ExitRate 9999 non-null float64 10 GoogleAnalytics_PageValue 9999 non-null float64 11 OS 9999 non-null object 12 Browser 9999 non-null int64 13 Country 9999 non-null object 14 Type_of_Traffic 9999 non-null int64 15 Type_of_Visitor 9999 non-null object 16 Buy 9999 non-null int64 dtypes: datetime64[ns](1), float64(6), int64(7), object(3) memory usage: 1.3+ MB
train.info()
#no missing values
<class 'pandas.core.frame.DataFrame'> Int64Index: 9999 entries, 102863333 to 798444008 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 9999 non-null datetime64[ns] 1 AccountMng_Pages 9999 non-null int64 2 AccountMng_Duration 9999 non-null float64 3 FAQ_Pages 9999 non-null int64 4 FAQ_Duration 9999 non-null float64 5 Product_Pages 9999 non-null int64 6 Product_Duration 9999 non-null float64 7 GoogleAnalytics_BounceRate 9999 non-null float64 8 GoogleAnalytics_ExitRate 9999 non-null float64 9 GoogleAnalytics_PageValue 9999 non-null float64 10 OS 9999 non-null object 11 Browser 9999 non-null int64 12 Country 9999 non-null object 13 Type_of_Traffic 9999 non-null int64 14 Type_of_Visitor 9999 non-null object 15 Buy 9999 non-null int64 dtypes: datetime64[ns](1), float64(6), int64(6), object(3) memory usage: 1.3+ MB
train.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| AccountMng_Pages | 9999.0 | 2.324232 | 3.340676 | 0.0 | 0.0000 | 1.0000 | 4.00000 | 27.0000 |
| AccountMng_Duration | 9999.0 | 81.205854 | 179.715545 | 0.0 | 0.0000 | 7.5000 | 92.20835 | 3398.7500 |
| FAQ_Pages | 9999.0 | 0.508051 | 1.279390 | 0.0 | 0.0000 | 0.0000 | 0.00000 | 24.0000 |
| FAQ_Duration | 9999.0 | 34.559101 | 139.796989 | 0.0 | 0.0000 | 0.0000 | 0.00000 | 2549.3750 |
| Product_Pages | 9999.0 | 31.685869 | 44.550277 | 0.0 | 7.0000 | 18.0000 | 38.00000 | 705.0000 |
| Product_Duration | 9999.0 | 1199.769430 | 1958.276304 | 0.0 | 183.5625 | 599.0000 | 1470.27080 | 63973.5222 |
| GoogleAnalytics_BounceRate | 9999.0 | 0.022305 | 0.048776 | 0.0 | 0.0000 | 0.0032 | 0.01680 | 0.2000 |
| GoogleAnalytics_ExitRate | 9999.0 | 0.043181 | 0.048845 | 0.0 | 0.0143 | 0.0251 | 0.05000 | 0.2000 |
| GoogleAnalytics_PageValue | 9999.0 | 5.963120 | 18.753626 | 0.0 | 0.0000 | 0.0000 | 0.00000 | 361.7637 |
| Browser | 9999.0 | 2.352535 | 1.715698 | 1.0 | 2.0000 | 2.0000 | 2.00000 | 13.0000 |
| Type_of_Traffic | 9999.0 | 3.957496 | 3.675159 | 1.0 | 2.0000 | 2.0000 | 4.00000 | 15.0000 |
| Buy | 9999.0 | 0.155216 | 0.362128 | 0.0 | 0.0000 | 0.0000 | 0.00000 | 1.0000 |
train.describe(include =['O']).T
| count | unique | top | freq | |
|---|---|---|---|---|
| OS | 9999 | 8 | Windows | 5361 |
| Country | 9999 | 9 | Portugal | 3870 |
| Type_of_Visitor | 9999 | 3 | Returner | 8534 |
# Functions
def target_dist (data,target):
from math import ceil
# All Numeric Variables' Histograms in one figure
metric_features = np.array(data.select_dtypes(include=np.number).set_index(data.index).columns)
metric_features = metric_features[metric_features!=target]
sns.set()
# Prepare figure. Create individual axes where each histogram will be placed
fig, axes = plt.subplots(2, ceil(len(metric_features) / 2), figsize=(20, 11))
# Plot data
# Iterate across axes objects and associate each histogram :
for ax, feat in zip(axes.flatten(), metric_features):
sns.barplot(x=(data.groupby(target).mean())[feat].index , y=(data.groupby(target).mean())[feat].values,ax=ax)
plt.ylabel(feat)
# Layout
# Add a centered title to the figure:
title = "Distribution of the target"
plt.suptitle(title)
plt.show()
return()
def histogram(data):
from math import ceil
# All Numeric Variables' Histograms in one figure
metric_features = data.select_dtypes(include=np.number).set_index(data.index).columns
sns.set()
# Prepare figure. Create individual axes where each histogram will be placed
fig, axes = plt.subplots(2, ceil(len(metric_features) / 2), figsize=(20, 11))
# Plot data
# Iterate across axes objects and associate each histogram (hint: use the ax.hist() instead of plt.hist()):
for ax, feat in zip(axes.flatten(), metric_features): # Notice the zip() function and flatten() method
sns.histplot(data[feat], ax = ax)
#plt.title(feat)
# Layout
# Add a centered title to the figure:
title = "Numeric Variables' Histograms"
plt.suptitle(title)
plt.show()
return()
def boxplot (data):
from math import ceil
# All Numeric Variables' Histograms in one figure
metric_features = data.select_dtypes(include=np.number).set_index(data.index).columns
sns.set()
# Prepare figure. Create individual axes where each histogram will be placed
fig, axes = plt.subplots(2, ceil(len(metric_features) / 2), figsize=(20, 11))
# Plot data
# Iterate across axes objects and associate each histogram (hint: use the ax.hist() instead of plt.hist()):
for ax, feat in zip(axes.flatten(), metric_features): # Notice the zip() function and flatten() method
sns.boxplot(data[feat], ax = ax)
#plt.title(feat)
# Layout
# Add a centered title to the figure:
title = "Numeric Variables' Histograms"
plt.suptitle(title)
plt.show()
def pairwise_relation_hue(data,target):
metric_features = data.select_dtypes(include=np.number).set_index(data.index).columns
# Pairwise Relationship of Numerical Variables
sns.set()
# Setting pairplot
sns.pairplot(data[metric_features],hue=target ,diag_kind="hist")
# Layout
plt.subplots_adjust(top=0.95)
plt.suptitle("Pairwise Relationship of Numerical Variables", fontsize=20)
plt.show()
return()
def cor_heatmap(cor):
plt.figure(figsize=(15,10))
sns.heatmap(data = cor, annot = True, cmap = plt.cm.Reds, fmt='.1')
plt.show()
def cor_heat_map (corr):
plt.figure(figsize = (30, 30))
sns.heatmap(corr, vmax = 1, vmin=-1, linewidths = 0.1,
annot = True, annot_kws = {"size": 10}, square = True)
plt.show()
def knn_imput (data):
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
imputer.fit(data)
transform = imputer.transform(data)
x = pd.DataFrame(transform, columns = data.columns, index = data.index)
return(x)
def Isolation_Forest (data):
#this algorithm only works with data without missing values
from sklearn.ensemble import IsolationForest
data_out = knn_imput(data.select_dtypes(include=np.number).set_index(data.index))
forest_model = IsolationForest(n_estimators=20,warm_start=True,contamination=0.03)
forest_model.fit(data_out)
anomally = forest_model.decision_function(data_out)
predict_outcome = forest_model.predict(data_out)
#creating a dataset with the density scores and the predicted outcome (-1==outlier; 1==normal obs)
data_out_score = pd.DataFrame(data=anomally, index=data_out.index)
data_out_score['predicted_outcome'] = predict_outcome
return(data_out_score)
# Buy behavior
sns.countplot(train['Buy'])
plt.show()
# Statistiscs by Buy label
for i in train.select_dtypes(include=np.number).columns[:-1]:
print(i)
print(train.groupby('Buy').describe()[i])
print('-'*125)
AccountMng_Pages
count mean std min 25% 50% 75% max
Buy
0 8447.0 2.129158 3.227723 0.0 0.0 0.0 3.0 27.0
1 1552.0 3.385954 3.724543 0.0 0.0 2.0 5.0 26.0
-----------------------------------------------------------------------------------------------------------------------------
AccountMng_Duration
count mean std min 25% 50% 75% max
Buy
0 8447.0 73.915389 173.187232 0.0 0.0 0.0 82.51665 3398.75
1 1552.0 120.885341 207.362647 0.0 0.0 52.0 150.45830 2086.75
-----------------------------------------------------------------------------------------------------------------------------
FAQ_Pages
count mean std min 25% 50% 75% max
Buy
0 8447.0 0.457322 1.226550 0.0 0.0 0.0 0.0 24.0
1 1552.0 0.784149 1.506207 0.0 0.0 0.0 1.0 12.0
-----------------------------------------------------------------------------------------------------------------------------
FAQ_Duration
count mean std min 25% 50% 75% max
Buy
0 8447.0 30.513960 133.538260 0.0 0.0 0.0 0.0 2549.3750
1 1552.0 56.575406 168.218523 0.0 0.0 0.0 18.0 1767.6667
-----------------------------------------------------------------------------------------------------------------------------
Product_Pages
count mean std min 25% 50% 75% max
Buy
0 8447.0 28.745827 40.972573 0.0 6.0 16.0 35.0 705.0
1 1552.0 47.687500 57.869757 0.0 15.0 29.0 57.0 517.0
-----------------------------------------------------------------------------------------------------------------------------
Product_Duration
count mean std min 25% 50% 75% \
Buy
0 8447.0 1077.186936 1856.234753 0.0 150.50 511.5000 1339.77225
1 1552.0 1866.943611 2330.167362 0.0 538.16 1108.1462 2242.08085
max
Buy
0 63973.5222
1 27009.8594
-----------------------------------------------------------------------------------------------------------------------------
GoogleAnalytics_BounceRate
count mean std min 25% 50% 75% max
Buy
0 8447.0 0.025475 0.052212 0.0 0.0 0.0043 0.0200 0.2
1 1552.0 0.005053 0.011775 0.0 0.0 0.0000 0.0063 0.2
-----------------------------------------------------------------------------------------------------------------------------
GoogleAnalytics_ExitRate
count mean std min 25% 50% 75% max
Buy
0 8447.0 0.047536 0.051515 0.0 0.0156 0.0286 0.0542 0.2
1 1552.0 0.019484 0.016218 0.0 0.0093 0.0159 0.0250 0.2
-----------------------------------------------------------------------------------------------------------------------------
GoogleAnalytics_PageValue
count mean std min 25% 50% 75% max
Buy
0 8447.0 2.051908 9.428616 0.0 0.000000 0.00000 0.0000 246.7586
1 1552.0 27.250497 35.302241 0.0 3.660625 16.33915 38.8474 361.7637
-----------------------------------------------------------------------------------------------------------------------------
Browser
count mean std min 25% 50% 75% max
Buy
0 8447.0 2.337161 1.683020 1.0 2.0 2.0 2.0 13.0
1 1552.0 2.436211 1.882028 1.0 2.0 2.0 2.0 13.0
-----------------------------------------------------------------------------------------------------------------------------
Type_of_Traffic
count mean std min 25% 50% 75% max
Buy
0 8447.0 3.973363 3.711809 1.0 2.0 2.0 4.0 15.0
1 1552.0 3.871134 3.468773 1.0 2.0 2.0 4.0 15.0
-----------------------------------------------------------------------------------------------------------------------------
# Target distribution
target_dist(train,'Buy')
()
for index,i in enumerate(train.select_dtypes(include=np.number).columns[:-1]):
plt.figure(index)
sns.barplot(x=(train.groupby('Buy').mean())[i].index , y=(train.groupby('Buy').mean())[i].values)
plt.title(i)
plt.show()
train.groupby(['Type_of_Visitor','Buy']).count()['Type_of_Traffic']
plt.figure()
sns.barplot((train.groupby('Type_of_Visitor').mean()['Buy']).index , (train.groupby('Type_of_Visitor').mean()['Buy']).values)
plt.show()
histogram(train)
()
boxplot(train)
#talvez problematicos: Account_Mng_Duration, FAQ_Pages, FAQ_Duration, Product_Pages,Product_Duration, GoogleAnalytics_PageValue
#Type_of_Traffic e Browser - categorico escondido!
pairwise_relation_hue(train,'Buy')
()
for i in train.select_dtypes(exclude=np.object0).columns[1:]:
fig, axes = plt.subplots(nrows = 1, ncols = 3, figsize = (18,5))
sns.violinplot(ax = axes[0], x = train['Buy'], y = train[i])
sns.distplot(train[i], hist = True, ax = axes[1])
sns.boxplot(ax = axes[2], x = train['Buy'], y = train[i])
plt.show()
from math import ceil
#Categorical Variables
sns.set()
fig, axes = plt.subplots(2, ceil(len(non_metric_features) / 2), figsize=(20, 11))
for ax, feat in zip(axes.flatten(), non_metric_features): # Notice the zip() function and flatten() method
sns.countplot(train[feat].astype(object), ax=ax, color='#007acc')
title = "Categorical/Low Cardinality Variables' Absolute Frequencies"
plt.suptitle(title)
plt.show()
for index,i in enumerate(train.select_dtypes(include=np.object0)):
plt.figure(index)
sns.lineplot(data=train.groupby(f'{i}').mean()['Buy'])
plt.xticks(rotation=45)
plt.show()
#With outliers:
cor_heatmap(train.corr())
def incoherencias (train):
train_incoherences = train.copy()
# Can't have visited a page and not have time spent there
train_incoherences.loc[((train_incoherences['AccountMng_Pages'] > 0) & (train_incoherences['AccountMng_Duration'] == 0)),'AccountMng_Pages'] = 0
train_incoherences.loc[((train_incoherences['FAQ_Pages'] > 0) & (train_incoherences['FAQ_Duration'] == 0)),'FAQ_Pages'] = 0
train_incoherences.loc[((train_incoherences['Product_Pages'] > 0) & (train_incoherences['Product_Duration'] == 0)),'Product_Pages' ] = 0
# Can't have time spent without visiting the respective page
train_incoherences.loc[((train_incoherences['AccountMng_Pages'] == 0) & (train_incoherences['AccountMng_Duration'] >0))]
train_incoherences.loc[((train_incoherences['FAQ_Pages'] == 0) & (train_incoherences['FAQ_Duration'] >0))]
train_incoherences.loc[((train_incoherences['Product_Pages'] == 0) & (train_incoherences['Product_Duration'] >0))]
# If bought any product, needs to have visited a product page
train_incoherences = train_incoherences.loc[((train_incoherences['Product_Pages']>0) & (train_incoherences['Buy']==1)) | (train_incoherences['Buy']==0)]
train = train_incoherences.copy()
print(train.isnull().sum())
return(train)
train.isnull().sum()
Date 0 AccountMng_Pages 0 AccountMng_Duration 0 FAQ_Pages 0 FAQ_Duration 0 Product_Pages 0 Product_Duration 0 GoogleAnalytics_BounceRate 0 GoogleAnalytics_ExitRate 0 GoogleAnalytics_PageValue 0 OS 0 Browser 0 Country 0 Type_of_Traffic 0 Type_of_Visitor 0 Buy 0 dtype: int64
train['AccountMng_Duration'].sort_values().value_counts()
train['FAQ_Pages'].sort_values().value_counts()
train['FAQ_Duration'].sort_values().value_counts()
train['Product_Pages'].sort_values().value_counts()
train['Product_Duration'].sort_values().value_counts()
train['GoogleAnalytics_PageValue'].sort_values().value_counts()
0.0000 7766
53.9880 5
42.2931 3
9.6411 2
1.0338 2
...
9.5812 1
9.5506 1
9.5425 1
9.5174 1
361.7637 1
Name: GoogleAnalytics_PageValue, Length: 2203, dtype: int64
def outliers (train):
outliers = Isolation_Forest(train)
outliers_test = train.copy()
outliers_test['predicted_outcome'] = outliers['predicted_outcome']
outliers_test.loc[outliers_test['predicted_outcome']==1, 'predicted_outcome']= 0
outliers_test.loc[outliers_test['predicted_outcome']==-1, 'predicted_outcome']= 1
outliers_test['outliers'] = outliers_test['predicted_outcome']
outliers_test['outliers']
outliers_test['outliers'].value_counts()
initial_len = len(train)
train_outliers = train.copy()
train_outliers = train_outliers.loc[train_outliers['AccountMng_Pages']<20]
train_outliers = train_outliers.loc[train_outliers['AccountMng_Duration']<1800]
train_outliers = train_outliers.loc[train_outliers['FAQ_Pages']<15]
train_outliers = train_outliers.loc[train_outliers['FAQ_Duration']<2000]
train_outliers = train_outliers.loc[train_outliers['Product_Pages']<500]
train_outliers = train_outliers.loc[train_outliers['Product_Duration']<20000]
train_outliers = train_outliers.loc[train_outliers['GoogleAnalytics_PageValue']<200]
train_outliers = train_outliers.loc[((train_outliers['GoogleAnalytics_PageValue']<95) & (train_outliers['Buy']==0)) | ((train_outliers['Buy']==1))]
train_outliers = train_outliers.loc[((train_outliers['GoogleAnalytics_ExitRate']<0.150) & (train_outliers['Buy']==1)) |((train_outliers['Buy']==0))]
train_outliers = train_outliers.loc[((train_outliers['GoogleAnalytics_BounceRate']<0.075) & (train_outliers['Buy']==1)) | ((train_outliers['Buy']==0))]
train_outliers = train_outliers.loc[((train_outliers['Product_Duration']<12500) & (train_outliers['Buy']==1)) | ((train_outliers['Buy']==0))]
train_outliers = train_outliers.loc[((train_outliers['Product_Pages']<290) & (train_outliers['Buy']==1)) | ((train_outliers['Buy']==0))]
train_outliers = train_outliers.loc[((train_outliers['FAQ_Duration']<1250) & (train_outliers['Buy']==1)) | ((train_outliers['Buy']==0))]
train_outliers = train_outliers.loc[((train_outliers['FAQ_Pages']<10) & (train_outliers['Buy']==1)) | ((train_outliers['Buy']==0))]
train_outliers = train_outliers.loc[((train_outliers['AccountMng_Duration']<1150) & (train_outliers['Buy']==1)) | ((train_outliers['Buy']==0))]
print(len(train_outliers)/initial_len)
train = train_outliers.copy()
outliers_test_df = train.loc[train.index.isin(outliers_test.loc[(outliers_test['outliers'] == 0) | (outliers_test['outliers'] ==1)].index)]
train = train.loc[train.index.isin(outliers_test_df.index)].copy()
print(len(train)/initial_len)
print(len(train))
return(train)
plt.figure()
sns.lineplot(data=train.groupby('Browser').mean()['Buy'])
plt.xticks(rotation=90)
plt.show()
train['Browser'].value_counts()
2 6484 1 1990 4 597 5 362 6 138 10 130 8 113 3 81 13 52 7 41 12 8 11 3 Name: Browser, dtype: int64
train['Type_of_Traffic'].value_counts()
2 3150 1 2008 3 1676 4 870 13 595 10 355 6 354 8 284 5 210 11 205 15 156 12 44 9 34 7 31 14 27 Name: Type_of_Traffic, dtype: int64
#Returners with 0 AccountManagement pages and Pagevalue>0
train.loc[(train['Type_of_Visitor'] == 'Returner') &(train['AccountMng_Pages']==0)
& (train['GoogleAnalytics_PageValue']>0),'Buy'].describe()
count 285.000000 mean 0.708772 std 0.455128 min 0.000000 25% 0.000000 50% 1.000000 75% 1.000000 max 1.000000 Name: Buy, dtype: float64
#Returners with AccountManagementpages>0 and Pagevalue>0
#50,7% bought
train.loc[(train['Type_of_Visitor'] == 'Returner')
&(train['GoogleAnalytics_PageValue']>0)
,'Buy'].describe()
count 1906.000000 mean 0.511018 std 0.500010 min 0.000000 25% 0.000000 50% 1.000000 75% 1.000000 max 1.000000 Name: Buy, dtype: float64
#New Acesses with - AccountMngPages and PageValue>0
#99% bought
train.loc[(train['Type_of_Visitor'] == 'New_Access') &(train['AccountMng_Pages']==0) &
(train['GoogleAnalytics_PageValue']>0)
,'Buy'].describe()
count 106.000000 mean 0.990566 std 0.097129 min 0.000000 25% 1.000000 50% 1.000000 75% 1.000000 max 1.000000 Name: Buy, dtype: float64
#New_accesses with and Pagevalue>0
#85% bought
train.loc[(train['Type_of_Visitor'] == 'New_Access')
&(train['GoogleAnalytics_PageValue']>0)
,'Buy'].describe()
count 313.00000 mean 0.84984 std 0.35780 min 0.00000 25% 1.00000 50% 1.00000 75% 1.00000 max 1.00000 Name: Buy, dtype: float64
# AccountMngPages==0 and PageValue>0
#78% bought
train.loc[(train['AccountMng_Pages']==0) &
(train['GoogleAnalytics_PageValue']>0)
,'Buy'].describe()
count 397.000000 mean 0.788413 std 0.408949 min 0.000000 25% 1.000000 50% 1.000000 75% 1.000000 max 1.000000 Name: Buy, dtype: float64
train.loc[(train['FAQ_Pages']==0) &
(train['GoogleAnalytics_PageValue']>0)
,'Buy'].describe()
count 1351.000000 mean 0.624722 std 0.484374 min 0.000000 25% 0.000000 50% 1.000000 75% 1.000000 max 1.000000 Name: Buy, dtype: float64
train.loc[(train['AccountMng_Pages']>0) &
(train['GoogleAnalytics_PageValue']==0)
,'Buy'].describe()
count 3491.000000 mean 0.056717 std 0.231335 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 1.000000 Name: Buy, dtype: float64
plt.figure()
sns.lineplot(data=train.groupby('Type_of_Traffic').mean()['Buy'])
plt.xticks(rotation=90)
plt.show()
def ft_engineer (data):
train=data.copy()
#Date Variables
train['Recency'] = pd.to_datetime(date.today()) - pd.to_datetime(train['Date'])
train[['days_since_visit','Days']] = train['Recency'].astype(str).str.split(' ',1,expand=True)
train['days_since_visit'] = train['days_since_visit'].astype(int)
train.drop(columns=['Recency','Days'], inplace=True)
train['Month'] = pd.DatetimeIndex(train['Date']).month
train['Month'] = train['Month'].astype(str)
train['Date'] = pd.to_datetime(train['Date'])
train.loc[train['Date']<'2020-04-01','Before_Break']=1
train['Before_Break'] = train['Before_Break'].fillna(value=0)
train['Before_Break'] = train['Before_Break'].astype(bool)
#creating dummy to take into consideration the type_of_traffic
train.loc[(train['Type_of_Visitor']=='Returner') &(train['Type_of_Traffic']==2), 'Traffic_2Returner'] =1
train['Traffic_2Returner'] = train['Traffic_2Returner'].fillna(value=0)
train['Traffic_2Returner'] = train['Traffic_2Returner'].astype(bool)
train.loc[(train['Type_of_Visitor']=='Returner') &(train['Type_of_Traffic'].isin([3,14,12,13])), 'Traffic_3Returner'] =1
train['Traffic_3Returner'] = train['Traffic_3Returner'].fillna(value=0)
train['Traffic_3Returner'] = train['Traffic_3Returner'].astype(bool)
train.loc[(train['Type_of_Visitor']=='New_Access') &(train['Type_of_Traffic'].isin([4,5,6,8])), 'Traffic_New_Access'] =1
train['Traffic_New_Access'] = train['Traffic_New_Access'].fillna(value=0)
train['Traffic_New_Access'] = train['Traffic_New_Access'].astype(bool)
train.loc[train['Type_of_Traffic']==2,'Traffic_2' ]= 1
train['Traffic_2'] = train['Traffic_2'].fillna(value=0).astype(bool)
train.loc[(train['Type_of_Traffic']==3) | (train['Type_of_Traffic']==1),'Traffic_1&3']= 1
train['Traffic_1&3'] = train['Traffic_1&3'].fillna(value=0)
train['Traffic_1&3'] = train['Traffic_1&3'].astype(bool)
train.loc[(train['Type_of_Traffic']==5) | (train['Type_of_Traffic']==7) |
(train['Type_of_Traffic']==8)|
(train['Type_of_Traffic']==10)| (train['Type_of_Traffic']==11)|
(train['Type_of_Traffic']==15)
,'Traffic_Cluster1' ]= 1
train['Traffic_Cluster1'] = train['Traffic_Cluster1'].fillna(value=0)
train['Traffic_Cluster1'] = train['Traffic_Cluster1'].astype(bool)
train.loc[(train['Type_of_Visitor'] == 'Returner')
&(train['GoogleAnalytics_PageValue']>0), 'Valuable_Returners'] =1
train['Valuable_Returners'] = train['Valuable_Returners'].fillna(value=0).astype(bool)
train.loc[(train['Type_of_Visitor'] == 'New_Access')
&(train['GoogleAnalytics_PageValue']>0), 'Valuable_new_access'] =1
train['Valuable_new_access'] = train['Valuable_new_access'].fillna(value=0).astype(bool)
train.loc[(train['AccountMng_Pages']==0) &
(train['GoogleAnalytics_PageValue']>0)
,'Value_noAccPage'] = 1
train['Value_noAccPage'] = train['Value_noAccPage'].fillna(value=0).astype(bool)
train.loc[(train['FAQ_Pages']==0) &
(train['GoogleAnalytics_PageValue']>0)
,'Value_noFAQPage'] = 1
train['Value_noFAQPage'] = train['Value_noFAQPage'].fillna(value=0).astype(bool)
#average duration per page
for i, ii in zip(train.loc[:,train.columns.str.contains('Pages')].columns, train.loc[:,train.columns.str.contains('Duration')].columns):
train[f'{ii}_Mins'] = train[ii]/60
a = train[ii]/60
train.loc[:,f'avg_{ii}_perPages'] = (a/train[i]).fillna(value=0)
#Page visit information
train['Total_pages'] = train['FAQ_Pages'] + train['Product_Pages'] + train['AccountMng_Pages']
train['total_dur'] = (train['AccountMng_Duration_Mins'])+ (train['FAQ_Duration_Mins'])+ (train['Product_Duration_Mins'])
train.loc[train['Product_Duration_Mins']>0,'Product_Page_rate'] = train['Product_Pages']/train['Product_Duration_Mins']
train['Product_Page_rate'] = train['Product_Page_rate'].fillna(value=0)
#Value-visit information
train.loc[:,'Value_perPage']= (train['GoogleAnalytics_PageValue']/train['Product_Pages']).fillna(value=0)
train['avg_value_PerProdDur'] = (train['GoogleAnalytics_PageValue']/(train['Product_Duration_Mins'])).fillna(value=0)
train['Value_avgProdDuration_rate'] = (train['GoogleAnalytics_PageValue']/train['avg_Product_Duration_perPages']).fillna(value=0)
#Exit and bounce rate features
train['diff_Exit_Bounce'] = train['GoogleAnalytics_ExitRate']-train['GoogleAnalytics_BounceRate']
train.loc[(train['GoogleAnalytics_BounceRate']>0) & (train['GoogleAnalytics_ExitRate']>0),'Bounce_by_Exit_rate']= train['GoogleAnalytics_BounceRate']/train['GoogleAnalytics_ExitRate']
train['Bounce_by_Exit_rate'] = train['Bounce_by_Exit_rate'].fillna(value=0)
#Creating dummies
def oneHotEncoder (data):
from sklearn.preprocessing import OneHotEncoder
non_metric_features =list(data.select_dtypes(include=np.object0).set_index(data.index).columns)
ohc = OneHotEncoder(sparse=False)
ohc_feat = ohc.fit_transform(data[non_metric_features])
names = ohc.get_feature_names()
ohc_cat = pd.DataFrame(data =ohc_feat ,columns = names, index = data.index)
for i in ohc_cat.columns:
ohc_cat[i] = ohc_cat[i].astype(bool)
return(ohc_cat)
ohc_cat = oneHotEncoder(train.drop('Type_of_Visitor',axis=1))
train = pd.concat([train,ohc_cat], axis=1)
train['Type_of_Visitor'].value_counts()
train.loc[((train['Type_of_Visitor'] == 'Other') | (train['Type_of_Visitor'] == 'Returner')), 'New_Access' ] = 0
train.loc[((train['Type_of_Visitor'] == 'New_Access')), 'New_Access' ] = 1
train['New_Access'] = train['New_Access'].astype(bool)
train.set_index('Access_ID',inplace=True)
return(train)
def scatter_hue (data,hue):
metric_features = np.array(data.select_dtypes(include=np.number).columns)
metric_features = metric_features[metric_features!=hue]
for i in metric_features:
col = np.array(metric_features)
col = col[col!=i]
for index, ii in enumerate(col):
plt.figure(index)
sns.scatterplot(data[i],data[ii],hue = data[hue])
plt.show()
scatter_hue(train,'Buy')
for index, i in enumerate(train.select_dtypes(include=np.bool).columns):
plt.figure(index)
sns.barplot((train.groupby(i).mean()['Buy']).index , (train.groupby(i).mean()['Buy']).values)
plt.show()
cor_heat_map(train.corr())
def scalling (x_train,x_val,test,standModel):
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
x = pd.DataFrame(x_train)
if standModel == 'standardize':
scaler = StandardScaler().fit(x)
if standModel == 'normalize':
scaler = MinMaxScaler(feature_range=(-1,1)).fit(x)
if standModel == 'robust':
scaler = RobustScaler().fit(x)
x_train_scaled = scaler.transform(x) # this will return an array
# Convert the array to a pandas dataframe
x_train_scaled = pd.DataFrame(x_train_scaled, columns = x.columns).set_index(x.index)
x_val_scaled = pd.DataFrame(scaler.transform(x_val), index = x_val.index,columns=x_train.columns)
x_test_scaled = pd.DataFrame(scaler.transform(test),index=test.index,columns=x_train.columns)
scaled_data = [x_train_scaled,x_val_scaled,x_test_scaled]
return(scaled_data)
def split(x,y):
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(x.copy(),y.copy(), test_size = 0.2, random_state = 20,stratify=y,
shuffle=True)
return(X_train,y_train,X_val,y_val)
def super_split (data):
returns = []
x = data.drop('Buy',axis=1).copy()
y = data['Buy'].copy()
metric_features = x.select_dtypes(include=np.number).columns
non_metric_features = x.select_dtypes(exclude=np.number).columns
modelling_data_out = split(x,y)
x_train = modelling_data_out[0]
x_val = modelling_data_out[2]
y_train = modelling_data_out[1]
y_val = modelling_data_out[3]
returns = [x_train, x_val, y_train, y_val]
return(returns)
def fit_assess (model , x_train,y_train, x_val, y_val):
model = model
model = stratKfold(x_train,y_train,model)
print_assess(model)
R2 = model['model'].score(x_val,y_val)
print(f'The R2 of the model is {R2}')
print(model['scores'])
return(model['model'])
def print_assess (data):
print(f'Avg F1 Score (Train):')
print(np.mean(data['scores']['f1_train']))
print(f'Avg F1 Score (Validation):')
print(data['scores']['f1_test'].mean())
print(f'Std F1 Score (Train):')
print((data['scores']['f1_train']).std())
print(f'Std F1 Score (Validation):')
print(data['scores']['f1_test'].std())
train_out = outliers(train)
train_out = incoherencias(train_out)
train_out.reset_index(inplace=True)
train_out = ft_engineer(train_out) #data without outliers
train_orig = incoherencias(train)
train_orig.reset_index(inplace=True)
train_orig = ft_engineer(train_orig) #data with outliers
0.9907990799079908 0.9907990799079908 9907 Date 0 AccountMng_Pages 0 AccountMng_Duration 0 FAQ_Pages 0 FAQ_Duration 0 Product_Pages 0 Product_Duration 0 GoogleAnalytics_BounceRate 0 GoogleAnalytics_ExitRate 0 GoogleAnalytics_PageValue 0 OS 0 Browser 0 Country 0 Type_of_Traffic 0 Type_of_Visitor 0 Buy 0 dtype: int64 Date 0 AccountMng_Pages 0 AccountMng_Duration 0 FAQ_Pages 0 FAQ_Duration 0 Product_Pages 0 Product_Duration 0 GoogleAnalytics_BounceRate 0 GoogleAnalytics_ExitRate 0 GoogleAnalytics_PageValue 0 OS 0 Browser 0 Country 0 Type_of_Traffic 0 Type_of_Visitor 0 Buy 0 dtype: int64
y_out = train_out.loc[:,'Buy'].copy()
x_out = train_out.drop('Buy',axis=1).copy()
x_orig = train_orig.drop('Buy',axis=1).copy()
y_orig = train_orig.loc[:,'Buy'].copy()
metric_features_x = x_out.select_dtypes(include=np.number).set_index(x_out.index).columns
non_metrics_x = x_out.select_dtypes(exclude=np.number).set_index(x_out.index).columns
non_cat_features = x_out.select_dtypes(exclude=np.object_).set_index(x_out.index).columns
train_out.drop(['Date','OS','Country','Type_of_Visitor','Month', 'Type_of_Traffic', 'Browser'],axis=1,inplace=True)
train_orig.drop(['Date','OS','Country','Type_of_Visitor','Month', 'Type_of_Traffic', 'Browser'],axis=1,inplace=True)
data_out = super_split(train_out)
data_orig = super_split(train_orig)
y_train_out = data_out[2]
y_val_out = data_out[3]
y_train_orig = data_orig[2]
y_val_orig = data_orig[3]
x_train_out = data_out[0]
x_val_out = data_out[1]
x_train_orig = data_orig[0]
x_val_orig = data_orig[1]
test = ft_engineer(test)
test.drop(['Month','Date','OS','Country','Type_of_Visitor', 'Type_of_Traffic', 'Browser'],axis=1,inplace=True)
data_dict ={}
for i in ['normalize', 'standardize','robust']:
data_dict[i] = scalling(data_out[0],data_out[1], test, i)
trains = []
validations = []
testes = []
for i in data_dict.keys():
for ii in range(len(data_dict[i])):
if ii == 0:
trains.append(data_dict[i][ii])
if ii == 1:
validations.append(data_dict[i][ii])
if ii == 2:
testes.append(data_dict[i][ii])
x_train_norm_out = trains[0]
x_train_stand_out = trains[1]
x_train_robust_out = trains[2]
x_val_norm_out = validations[0]
x_val_stand_out = validations[1]
x_val_robust_out = validations[2]
test_norm_out = testes[0]
test_stand_out = testes[1]
test_robust_out = testes[2]
data_dict_orig ={}
for i in ['normalize', 'standardize','robust']:
data_dict_orig[i] = scalling(data_orig[0],data_orig[1], test, i)
trains = []
validations = []
testes = []
for i in data_dict_orig.keys():
for ii in range(len(data_dict_orig[i])):
if ii == 0:
trains.append(data_dict_orig[i][ii])
if ii == 1:
validations.append(data_dict_orig[i][ii])
if ii == 2:
testes.append(data_dict_orig[i][ii])
x_train_norm = trains[0]
x_train_stand = trains[1]
x_train_robust = trains[2]
x_val_norm = validations[0]
x_val_stand = validations[1]
x_val_robust = validations[2]
test_norm = testes[0]
test_stand = testes[1]
test_robust = testes[2]
non_cat_features = x_train_stand.select_dtypes(exclude=np.object_).set_index(x_train_stand.index).columns
x_train_metrics = x_train_stand.loc[:,non_cat_features].copy()
x_val_metrics = x_val_stand.loc[:,non_cat_features].copy()
def PCA (data,indexes, n_components):
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as PCA
data.reset_index()
metric_features = list(data.select_dtypes(include=np.number).set_index(data.index).columns)
x = data.loc[:,metric_features]
stand = StandardScaler().fit_transform(x)
s = pd.DataFrame(stand, columns= x.columns)
s.set_index(data.index)
pca = PCA(n_components=n_components)
components = pd.DataFrame(pca.fit_transform(s), index = data.index)
df = pd.DataFrame(
{"Eigenvalue": pca.explained_variance_,
"Difference": np.insert(np.diff(pca.explained_variance_), 0, 0),
"Proportion": pca.explained_variance_ratio_,
"Cumulative": np.cumsum(pca.explained_variance_ratio_)},
index=range(1, pca.n_components_ + 1))
PC = pca
plt.figure()
cum_sum = PC.explained_variance_ratio_.cumsum()
exp_var = PC.explained_variance_
cov = PC.get_covariance()
plt.figure(figsize=(10,10))
sns.lineplot(data = df.loc[:,['Eigenvalue']])
plt.axhline(1,ls='--')
plt.show()
plt.figure(figsize=(10,10))
sns.lineplot(data = df.loc[:,['Cumulative','Proportion']])
plt.show()
print('The variance explained by each component is: ' + str(exp_var))
print('The total variance explained by the components is: '+ str(sum(PC.explained_variance_ratio_)))
return(df, components)
pca = PCA(train_out.loc[:,train_out.select_dtypes(include=np.number).columns].drop('Buy',axis=1),
train_out.loc[:,train_out.select_dtypes(include=np.number).columns].drop('Buy',axis=1).index,6)
<Figure size 432x288 with 0 Axes>
The variance explained by each component is: [6.51762132 2.94794425 2.5261752 2.38255609 2.14746103 1.27213884] The total variance explained by the components is: 0.7413374660122836
def PCA_Assess_loadings (data, components, n_components):
df = pd.concat([data, components], axis=1)
loadings = df.corr().iloc[:-n_components,-n_components:]
def _color_red_or_green(val):
if val < -0.45:
color = 'background-color: red'
elif val > 0.45:
color = 'background-color: green'
else:
color = ''
return color
return(loadings.style.applymap(_color_red_or_green))
PCA_Assess_loadings(train_out.select_dtypes(include=np.number).drop('Buy',axis=1),pca[1],6)
| 0 | 1 | 2 | 3 | 4 | 5 | |
|---|---|---|---|---|---|---|
| AccountMng_Pages | 0.623891 | 0.101471 | -0.076279 | 0.352846 | 0.114225 | 0.216325 |
| AccountMng_Duration | 0.552974 | 0.142376 | -0.157949 | 0.738562 | 0.224678 | -0.008030 |
| FAQ_Pages | 0.570613 | -0.135615 | 0.388119 | 0.085206 | -0.076722 | 0.097203 |
| FAQ_Duration | 0.540345 | -0.217697 | 0.751719 | 0.103173 | -0.234796 | -0.019766 |
| Product_Pages | 0.827159 | -0.116626 | -0.212606 | -0.354775 | 0.118392 | 0.200809 |
| Product_Duration | 0.858013 | -0.142417 | -0.210682 | -0.357796 | 0.105035 | -0.098653 |
| GoogleAnalytics_BounceRate | -0.376267 | -0.435159 | 0.268641 | -0.025788 | 0.721987 | -0.052901 |
| GoogleAnalytics_ExitRate | -0.485774 | -0.434664 | 0.243646 | -0.029988 | 0.522530 | -0.061010 |
| GoogleAnalytics_PageValue | 0.160191 | 0.752853 | 0.262225 | -0.181298 | 0.232573 | -0.050498 |
| days_since_visit | -0.177883 | -0.071423 | 0.049937 | 0.089521 | -0.061309 | -0.068118 |
| AccountMng_Duration_Mins | 0.552974 | 0.142376 | -0.157949 | 0.738562 | 0.224678 | -0.008030 |
| avg_AccountMng_Duration_perPages | 0.345716 | 0.165201 | -0.156310 | 0.662472 | 0.152413 | -0.137977 |
| FAQ_Duration_Mins | 0.540345 | -0.217697 | 0.751719 | 0.103173 | -0.234796 | -0.019766 |
| avg_FAQ_Duration_perPages | 0.423323 | -0.186784 | 0.689297 | 0.077529 | -0.237662 | -0.062172 |
| Product_Duration_Mins | 0.858013 | -0.142417 | -0.210682 | -0.357796 | 0.105035 | -0.098653 |
| avg_Product_Duration_perPages | 0.190507 | 0.069169 | -0.112171 | -0.046595 | -0.188445 | -0.778929 |
| Total_pages | 0.854706 | -0.107256 | -0.197442 | -0.308242 | 0.119658 | 0.211379 |
| total_dur | 0.904163 | -0.137444 | -0.160578 | -0.263128 | 0.103707 | -0.095740 |
| Product_Page_rate | -0.152991 | 0.029055 | 0.026312 | 0.039347 | -0.142238 | 0.661867 |
| Value_perPage | 0.002025 | 0.722491 | 0.277124 | -0.182554 | 0.211707 | -0.105786 |
| avg_value_PerProdDur | -0.007238 | 0.624125 | 0.261202 | -0.171234 | 0.212448 | 0.037015 |
| Value_avgProdDuration_rate | 0.116976 | 0.772336 | 0.285208 | -0.193751 | 0.254588 | 0.075305 |
| diff_Exit_Bounce | -0.263554 | 0.000238 | -0.059382 | -0.010133 | -0.476968 | -0.019571 |
| Bounce_by_Exit_rate | -0.159831 | -0.415995 | 0.212477 | -0.028864 | 0.697182 | -0.034513 |
x_pca = pca[1]
pca_df = x_pca.join(train_out['Buy'])
x_pca1= pca_df.drop('Buy',axis=1)
y_pca1= pca_df.loc[:,'Buy']
x_train_pca,y_train_pca ,x_val_pca,y_val_pca = split(x_pca1,y_pca1)
x_metrics = x_train_stand.loc[:,non_cat_features].copy()
def mutual_information_classification (x,y,n_best):
import sklearn
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
n_best = n_best
scores = list(mutual_info_classif(x, y))
high_score_features1 = []
for i, item in sorted(zip(scores, x.columns), reverse=True)[:n_best]:
print(item, i)
high_score_features1.append(item)
return(high_score_features1)
set1 = mutual_information_classification(x_metrics,y_train_orig,10)
Value_perPage 0.16762198491042812 avg_value_PerProdDur 0.16539307213359566 GoogleAnalytics_PageValue 0.16397818039716383 Value_avgProdDuration_rate 0.155457455942825 Value_noFAQPage 0.10628681136071982 Valuable_Returners 0.08962032339570536 GoogleAnalytics_ExitRate 0.04395871468245294 Value_noAccPage 0.0416458303871301 total_dur 0.03648562484143403 Valuable_new_access 0.035366905600862086
def RFE (X_train,X_validation,y_train,y_validation, n_best, model):
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
threshold = n_best # number of most relevant features
model = model
ft = model.fit(X_train,y_train)
print(f'The validation score is {ft.score(X_validation,y_validation)}')
print(f'The train score is {ft.score(X_train,y_train)}')
selector = RFE(estimator=model, n_features_to_select=n_best)
selector_x = selector.fit_transform(X_train,y_train)
selector_ind = selector.get_support()
selector.ranking_
selected_features = pd.Series(selector.support_, index = X_train.columns)
selected_features = selected_features.sort_values(ascending=False).iloc[:n_best]
return(selected_features)
from sklearn.ensemble import RandomForestClassifier
set2 = RFE(x_train_metrics,x_val_metrics,y_train_orig,y_val_orig,10,RandomForestClassifier(max_depth=3,max_features=0.6,random_state=0))
The validation score is 0.8913913913913913 The train score is 0.902014766612439
def f_classification (x,y,n_best):
from sklearn.feature_selection import f_classif
threshold = 10 # number of most relevant features
high_score_features1 = []
feature_scores = f_classif(x, y)[0]
for score, f_name in sorted(zip(feature_scores, x.columns), reverse=True)[:threshold]:
print(f_name, score)
high_score_features1.append(f_name)
return(high_score_features1)
set3 =f_classification(x_metrics,y_train_orig,10)
Value_noFAQPage 2976.314377867499 GoogleAnalytics_PageValue 2453.882257764635 Valuable_Returners 2369.451332721842 Value_avgProdDuration_rate 1524.5453810791237 Value_noAccPage 1310.853989950417 Valuable_new_access 1050.5067704263183 Value_perPage 1034.1785671997714 GoogleAnalytics_ExitRate 370.2297570147004 avg_value_PerProdDur 254.71593657012272 x2_11 211.0127638188554
def lasso_selection (x, y):
from sklearn.linear_model import LassoCV
reg = LassoCV()
reg.fit(x,y)
coef = pd.Series(reg.coef_, index = x.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")
return(coef.loc[abs(coef.values)>0])
set4 = lasso_selection(x_metrics, y_train_orig)
def plot_importance(coef,name):
imp_coef = coef.sort_values()
plt.figure(figsize=(8,10))
imp_coef.plot(kind = "barh")
plt.title("Feature importance using " + name + " Model")
plt.show()
plot_importance(set4[:],'Lasso')
Lasso picked 39 variables and eliminated the other 24 variables
def dt_feature_selection_classification(X_train,y_train,n_best):
gini_importance = DecisionTreeClassifier().fit(X_train, y_train).feature_importances_
entropy_importance = DecisionTreeClassifier(criterion='entropy').fit(X_train, y_train).feature_importances_
zippy = pd.DataFrame(zip(gini_importance, entropy_importance), columns = ['gini','entropy'])
zippy['col'] = X_train.columns
tidy = zippy.melt(id_vars='col').rename(columns=str.title)
tidy.sort_values(['Value'], ascending = False, inplace = True)
plt.figure(figsize=(15,8))
print(sns.barplot(y='Col', x='Value', hue='Variable', data=tidy))
return(tidy.iloc[:n_best].loc[:,'Col'].values)
set5 = dt_feature_selection_classification(x_metrics,y_train_orig,10)
AxesSubplot(0.125,0.125;0.775x0.755)
ft_selection = pd.DataFrame(index= x_metrics.columns, columns= {'Mutual information', 'RFE', 'F_Classification', 'DT'})
ft_selection.loc[ft_selection.index.isin(set1),'Mutual information'] = 1
ft_selection.loc[ft_selection.index.isin(set2.index),'RFE'] = 1
ft_selection.loc[ft_selection.index.isin(set3),'F_Classification'] = 1
ft_selection.loc[ft_selection.index.isin(set4.index),'Lasso'] = 1
ft_selection.loc[ft_selection.index.isin(set5),'DT'] = 1
ft_selection.fillna(value=0,inplace=True)
ft_selection['Total']=0
for i in ft_selection.columns[:-1]:
ft_selection['Total'] = ft_selection['Total'] + ft_selection[i]
ft_selection.sort_values('Total',ascending=False)
| RFE | Mutual information | DT | F_Classification | Lasso | Total | |
|---|---|---|---|---|---|---|
| Value_perPage | 1 | 1 | 1 | 1 | 1.0 | 5.0 |
| GoogleAnalytics_ExitRate | 1 | 1 | 1 | 1 | 1.0 | 5.0 |
| GoogleAnalytics_PageValue | 1 | 1 | 1 | 1 | 1.0 | 5.0 |
| Value_avgProdDuration_rate | 1 | 1 | 1 | 1 | 0.0 | 4.0 |
| avg_value_PerProdDur | 1 | 1 | 0 | 1 | 1.0 | 4.0 |
| ... | ... | ... | ... | ... | ... | ... |
| AccountMng_Duration_Mins | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
| avg_FAQ_Duration_perPages | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
| avg_Product_Duration_perPages | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
| x0_Android | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
| x1_Other | 0 | 0 | 0 | 0 | 0.0 | 0.0 |
63 rows × 6 columns
subset1 = ['AccountMng_Pages', 'avg_value_PerProdDur','Valuable_new_access',
'Value_noFAQPage', 'Value_noAccPage', 'x2_11', 'Valuable_Returners', 'Bounce_by_Exit_rate',
'GoogleAnalytics_PageValue','New_Access','Traffic_1&3']
subset2 = ['AccountMng_Pages', 'avg_value_PerProdDur','Valuable_new_access',
'Value_noFAQPage', 'Value_noAccPage', 'x2_11', 'Valuable_Returners', 'Bounce_by_Exit_rate',
'Value_avgProdDuration_rate','New_Access','Traffic_1&3']
subset3 = ['AccountMng_Pages', 'avg_value_PerProdDur','Valuable_new_access',
'Value_noFAQPage', 'Value_noAccPage', 'x2_11', 'Valuable_Returners', 'Bounce_by_Exit_rate',
'GoogleAnalytics_PageValue','Traffic_1&3']
subset4 = ['AccountMng_Pages', 'Value_perPage','Valuable_new_access',
'Value_noFAQPage', 'Value_noAccPage', 'x2_11', 'Valuable_Returners', 'diff_Exit_Bounce',
'GoogleAnalytics_PageValue','Traffic_1&3']
subset5 = ['AccountMng_Pages','AccountMng_Duration_Mins',
'FAQ_Duration_Mins', 'Product_Duration_Mins', 'x2_11', 'Valuable_Returners', 'GoogleAnalytics_ExitRate',
'GoogleAnalytics_PageValue','FAQ_Pages','New_Access']
Preliminary assessment of the performance of subsets created
def full_assessment (estimators,names,x_train,y_train,x_val,y_val):
names= names
moddeling = multiple_algorithms(x_train,y_train,x_val,y_val,estimators=estimators,names=names)
print(pd.DataFrame(moddeling[0].loc[:,'f1_Validation']))
roc_curve(estimators,names,x_train,y_train,x_val,y_val)
for i ,name in zip(estimators,names):
print(precision_recall_curve(i,name,x_train,y_train,x_val,y_val))
return(moddeling)
def multiple_algorithms (x_train, y_train, x_val,y_val, estimators,names):
results = pd.DataFrame(index=pd.Series(names).astype(str).str[:5])
scores = []
recall = []
precision = []
models = {}
ft_importance = pd.DataFrame(index = x_train.columns)
for i in estimators:
print(i)
sk = stratKfold(x_train.loc[:,:],y_train,i)
scores.append(sk['scores'].loc[:,'f1_test'].mean())
recall.append(sk['scores'].loc[:,'recall_test'].mean())
precision.append(sk['scores'].loc[:,'precision_test'].mean())
models[i] = sk['model']
pred_train = sk['model'].predict(x_train.loc[:,])
pred_val = sk['model'].predict(x_val.loc[:,:])
print(sk['scores'].loc[:,'f1_test'].mean())
print('-'*125)
results['f1_Validation'] = scores
results['recall_validation'] = recall
results['precision_validation'] = precision
returns = [results, models]
return(returns)
def stratKfold (X_train,y_train, model):
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(10,shuffle=True)
avg_score_train = []
avg_score_test = []
avg_precision_test = []
avg_precision_train = []
avg_recall_test = []
avg_recall_train = []
scores = pd.DataFrame()
train_score = []
test_score = []
train_recall = []
test_recall = []
train_precision = []
test_precision = []
for train_index, test_index in skf.split(X_train,y_train):
x_train, x_test = X_train.iloc[train_index], X_train.iloc[test_index]
Y_train, Y_test = y_train.iloc[train_index], y_train.iloc[test_index]
algo = model.fit(x_train, Y_train)
f1_train = f1_score(Y_train,algo.predict(x_train))
recall_train = recall_score(Y_train,algo.predict(x_train))
precision_train = precision_score(Y_train,algo.predict(x_train))
f1_test = f1_score(Y_test,algo.predict(x_test))
recall_test = recall_score(Y_test,algo.predict(x_test))
precision_test = precision_score(Y_test,algo.predict(x_test))
train_score.append(f1_train)
train_recall.append(recall_train)
train_precision.append(precision_train)
test_score.append(f1_test)
test_recall.append(recall_test)
test_precision.append(precision_test)
avg_score_train.append(np.round(np.mean(train_score),3))
avg_score_test.append(np.round(np.mean(test_score),3))
avg_recall_test.append(np.round(np.mean(test_recall),3))
avg_recall_train.append(np.round(np.mean(train_recall),3))
avg_precision_test.append(np.round(np.mean(test_precision),3))
avg_precision_train.append(np.round(np.mean(train_precision),3))
scores['f1_train'] = avg_score_train
scores['recall_train'] = avg_recall_train
scores['precision_train'] = avg_precision_train
scores['f1_test'] = avg_score_test
scores['recall_test'] = avg_recall_test
scores['precision_test'] = avg_precision_test
returns = {}
returns['scores'] = scores
returns['model'] = model
model_assess = pd.DataFrame(index = returns['scores'].index)
model_assess['train_score'] = avg_score_train
model_assess['test_score'] = avg_score_test
return(returns)
def roc_curve (models,names,X_train,y_train, X_val, y_val):
names= names
from sklearn.metrics import roc_curve
for i, names ,in zip(models,names):
i.fit(X_train,y_train)
thresh_dict = {}
prob_model = i.predict_proba(X_val)
fpr_DT_ent, tpr_DT_ent, thresholds_DT_ent = roc_curve(y_val, prob_model[:,1])
plt.plot(fpr_DT_ent, tpr_DT_ent, label=f"ROC Curve {names}")
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
plt.show()
def precision_recall_curve (model,names, X_train,y_train,x_val,y_val):
names=names
model.fit(X_train,y_train)
proba = model.predict_proba(x_val)
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_val, proba[:,1])
# apply f1 score
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
plt.plot(recall, precision, marker='.', label=f'{names}')
plt.scatter(recall[ix], precision[ix], marker='o', color='black', label='Best')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()
estimators = [GaussianNB(), LogisticRegression(class_weight='balanced'),
DecisionTreeClassifier(class_weight='balanced',max_depth=3),
KNeighborsClassifier(), GradientBoostingClassifier(n_estimators=200,max_depth=4)]
names = ['Naive_Bayes','LR_Balanced','DecisionTree_Balanced','KNN','GradientBooster']
subset1_assess = full_assessment(estimators,names,x_train_stand_out[subset1],y_train_out,x_val_stand_out[subset1],y_val_out)
subset1_assess[0]
GaussianNB()
0.6313
-----------------------------------------------------------------------------------------------------------------------------
LogisticRegression(class_weight='balanced')
0.6464
-----------------------------------------------------------------------------------------------------------------------------
DecisionTreeClassifier(class_weight='balanced', max_depth=3)
0.644
-----------------------------------------------------------------------------------------------------------------------------
KNeighborsClassifier()
0.6222000000000001
-----------------------------------------------------------------------------------------------------------------------------
GradientBoostingClassifier(max_depth=4, n_estimators=200)
0.6565999999999999
-----------------------------------------------------------------------------------------------------------------------------
f1_Validation
Naive 0.6313
LR_Ba 0.6464
Decis 0.6440
KNN 0.6222
Gradi 0.6566
Best Threshold=0.819115, F-Score=0.677
None Best Threshold=1.000000, F-Score=nan
None Best Threshold=0.744774, F-Score=0.676
None Best Threshold=0.400000, F-Score=0.617
None Best Threshold=0.482188, F-Score=0.674
None
| f1_Validation | recall_validation | precision_validation | |
|---|---|---|---|
| Naive | 0.6313 | 0.6634 | 0.6048 |
| LR_Ba | 0.6464 | 0.8383 | 0.5265 |
| Decis | 0.6440 | 0.7798 | 0.5486 |
| KNN | 0.6222 | 0.5828 | 0.6692 |
| Gradi | 0.6566 | 0.6087 | 0.7150 |
subset2_assess = full_assessment(estimators,names,x_train_stand_out[subset2],y_train_out,x_val_stand_out[subset2],y_val_out)
subset2_assess[0]
GaussianNB()
0.6451
-----------------------------------------------------------------------------------------------------------------------------
LogisticRegression(class_weight='balanced')
0.6287
-----------------------------------------------------------------------------------------------------------------------------
DecisionTreeClassifier(class_weight='balanced', max_depth=3)
0.6464000000000001
-----------------------------------------------------------------------------------------------------------------------------
KNeighborsClassifier()
0.6249
-----------------------------------------------------------------------------------------------------------------------------
GradientBoostingClassifier(max_depth=4, n_estimators=200)
0.6384999999999998
-----------------------------------------------------------------------------------------------------------------------------
f1_Validation
Naive 0.6451
LR_Ba 0.6287
Decis 0.6464
KNN 0.6249
Gradi 0.6385
Best Threshold=0.014376, F-Score=0.671
None Best Threshold=1.000000, F-Score=nan
None Best Threshold=0.744774, F-Score=0.676
None Best Threshold=0.400000, F-Score=0.623
None Best Threshold=0.338423, F-Score=0.675
None
| f1_Validation | recall_validation | precision_validation | |
|---|---|---|---|
| Naive | 0.6451 | 0.6550 | 0.6379 |
| LR_Ba | 0.6287 | 0.8263 | 0.5080 |
| Decis | 0.6464 | 0.7905 | 0.5476 |
| KNN | 0.6249 | 0.5898 | 0.6675 |
| Gradi | 0.6385 | 0.5825 | 0.7093 |
subset3_assess = full_assessment(estimators,names,x_train_stand_out[subset3],y_train_out,x_val_stand_out[subset3],y_val_out)
subset3_assess[0]
GaussianNB()
0.6477999999999999
-----------------------------------------------------------------------------------------------------------------------------
LogisticRegression(class_weight='balanced')
0.6462999999999999
-----------------------------------------------------------------------------------------------------------------------------
DecisionTreeClassifier(class_weight='balanced', max_depth=3)
0.6555000000000001
-----------------------------------------------------------------------------------------------------------------------------
KNeighborsClassifier()
0.6043000000000001
-----------------------------------------------------------------------------------------------------------------------------
GradientBoostingClassifier(max_depth=4, n_estimators=200)
0.6336
-----------------------------------------------------------------------------------------------------------------------------
f1_Validation
Naive 0.6478
LR_Ba 0.6463
Decis 0.6555
KNN 0.6043
Gradi 0.6336
Best Threshold=0.858146, F-Score=0.677
None Best Threshold=1.000000, F-Score=nan
None Best Threshold=0.744774, F-Score=0.676
None Best Threshold=0.400000, F-Score=0.627
None Best Threshold=0.399319, F-Score=0.680
None
| f1_Validation | recall_validation | precision_validation | |
|---|---|---|---|
| Naive | 0.6478 | 0.6721 | 0.6261 |
| LR_Ba | 0.6463 | 0.7918 | 0.5462 |
| Decis | 0.6555 | 0.7985 | 0.5560 |
| KNN | 0.6043 | 0.5509 | 0.6702 |
| Gradi | 0.6336 | 0.5703 | 0.7182 |
subset4_assess = full_assessment(estimators,names,x_train_stand_out[subset4],y_train_out,x_val_stand_out[subset4],y_val_out)
subset4_assess[0]
GaussianNB()
0.6407999999999998
-----------------------------------------------------------------------------------------------------------------------------
LogisticRegression(class_weight='balanced')
0.6628000000000001
-----------------------------------------------------------------------------------------------------------------------------
DecisionTreeClassifier(class_weight='balanced', max_depth=3)
0.6667
-----------------------------------------------------------------------------------------------------------------------------
KNeighborsClassifier()
0.6123999999999999
-----------------------------------------------------------------------------------------------------------------------------
GradientBoostingClassifier(max_depth=4, n_estimators=200)
0.6155999999999999
-----------------------------------------------------------------------------------------------------------------------------
f1_Validation
Naive 0.6408
LR_Ba 0.6628
Decis 0.6667
KNN 0.6124
Gradi 0.6156
Best Threshold=0.895423, F-Score=0.680
None Best Threshold=0.706254, F-Score=0.689
None Best Threshold=0.706801, F-Score=0.676
None Best Threshold=0.400000, F-Score=0.617
None Best Threshold=0.421297, F-Score=0.701
None
| f1_Validation | recall_validation | precision_validation | |
|---|---|---|---|
| Naive | 0.6408 | 0.6709 | 0.6143 |
| LR_Ba | 0.6628 | 0.8178 | 0.5574 |
| Decis | 0.6667 | 0.8174 | 0.5634 |
| KNN | 0.6124 | 0.5675 | 0.6682 |
| Gradi | 0.6156 | 0.5599 | 0.6863 |
subset5_assess = full_assessment(estimators,names,x_train_stand_out[subset5],y_train_out,x_val_stand_out[subset5],y_val_out)
subset5_assess[0]
GaussianNB()
0.6176999999999999
-----------------------------------------------------------------------------------------------------------------------------
LogisticRegression(class_weight='balanced')
0.6318
-----------------------------------------------------------------------------------------------------------------------------
DecisionTreeClassifier(class_weight='balanced', max_depth=3)
0.6303
-----------------------------------------------------------------------------------------------------------------------------
KNeighborsClassifier()
0.6277999999999999
-----------------------------------------------------------------------------------------------------------------------------
GradientBoostingClassifier(max_depth=4, n_estimators=200)
0.6518
-----------------------------------------------------------------------------------------------------------------------------
f1_Validation
Naive 0.6177
LR_Ba 0.6318
Decis 0.6303
KNN 0.6278
Gradi 0.6518
Best Threshold=0.620913, F-Score=0.631
None Best Threshold=0.715206, F-Score=0.671
None Best Threshold=0.693043, F-Score=0.676
None Best Threshold=0.400000, F-Score=0.619
None Best Threshold=0.405042, F-Score=0.690
None
| f1_Validation | recall_validation | precision_validation | |
|---|---|---|---|
| Naive | 0.6177 | 0.7562 | 0.5231 |
| LR_Ba | 0.6318 | 0.8321 | 0.5097 |
| Decis | 0.6303 | 0.9037 | 0.4842 |
| KNN | 0.6278 | 0.5732 | 0.6959 |
| Gradi | 0.6518 | 0.5949 | 0.7214 |
Conclusion:
Note: it was used standardized data without outliers to perform the tests since its the data that generally works better for algorithms, although there might be some exceptions
def metrics(y_train, pred_train , y_val, pred_val):
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print('___________________________________________________________________________________________________________')
print(' TRAIN ')
print('-----------------------------------------------------------------------------------------------------------')
print(classification_report(y_train, pred_train))
print(confusion_matrix(y_train, pred_train))
print('___________________________________________________________________________________________________________')
print(' VALIDATION ')
print('-----------------------------------------------------------------------------------------------------------')
print(classification_report(y_val, pred_val))
print(confusion_matrix(y_val, pred_val))
def Model_test (model, x_train,x_val,y_train,y_val):
from sklearn.metrics import f1_score
model = model
model.fit(x_train,y_train)
model_predTrain = model.predict(x_train)
model_predval = model.predict(x_val)
metrics(y_train,model_predTrain,y_val,model_predval)
print(f'The train score is ')
print(np.round(f1_score(y_train,model_predTrain),4))
print(f'The val score is ')
print(np.round(f1_score(y_val,model_predval),4))
return(model)
Models tested:
LR1 = LogisticRegression(random_state=10,solver='lbfgs').fit(x_train_robust[subset3],y_train_orig)
LR2 = LogisticRegression(class_weight='balanced').fit(x_train_orig[subset1],y_train_orig)
LR3 = LogisticRegression(class_weight='balanced').fit(x_train_stand[subset1],y_train_orig)
Best model:
print('Metrics for the best Logistic Regression')
Model_test(LR1, x_train_robust[subset3],x_val_robust[subset3],y_train_orig,y_val_orig)
Metrics for the best Logistic Regression
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.92 0.96 0.94 6757
1 0.71 0.57 0.63 1234
accuracy 0.90 7991
macro avg 0.82 0.76 0.79 7991
weighted avg 0.89 0.90 0.89 7991
[[6471 286]
[ 530 704]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.92 0.95 0.94 1690
1 0.69 0.56 0.62 308
accuracy 0.89 1998
macro avg 0.81 0.76 0.78 1998
weighted avg 0.89 0.89 0.89 1998
[[1613 77]
[ 136 172]]
The train score is
0.6331
The val score is
0.6176
LogisticRegression(random_state=10)
Models tested:
NB1 = GaussianNB().fit(x_train_out[subset4],y_train_out)
NB2 = GaussianNB().fit(x_train_orig[subset4],y_train_orig)
Best model:
print('Metrics for the best Gaussian Naive Bayes')
Model_test(NB1, x_train_orig[subset4],x_val_orig[subset4],y_train_orig,y_val_orig)
Metrics for the best Gaussian Naive Bayes
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.94 0.93 0.93 6757
1 0.63 0.66 0.64 1234
accuracy 0.89 7991
macro avg 0.78 0.79 0.79 7991
weighted avg 0.89 0.89 0.89 7991
[[6275 482]
[ 418 816]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.94 0.93 0.93 1690
1 0.62 0.67 0.64 308
accuracy 0.89 1998
macro avg 0.78 0.80 0.79 1998
weighted avg 0.89 0.89 0.89 1998
[[1564 126]
[ 102 206]]
The train score is
0.6445
The val score is
0.6438
GaussianNB()
Adapted function to calculate optimal number of neighbors and f1-score
numberK_list=np.arange(1,21)
high_score=0
nof=0
score_list_train =[]
score_list_val =[]
for n in numberK_list:
model = KNeighborsClassifier(n_neighbors = n).fit(x_train_norm[subset1], y_train_orig)
labels_train = model.predict(x_train_norm[subset1])
labels_val = model.predict(x_val_norm[subset1])
score_train = f1_score(y_train_orig, labels_train)
score_val = f1_score(y_val_orig, labels_val)
score_list_train.append(score_train)
score_list_val.append(score_val)
if(score_val>high_score):
high_score = score_val
nof = numberK_list[n-1]
print("Best number of neighbors: %d" %nof)
print("F1 in train with %d neighbors: %f" % (nof, score_list_train[nof-1]))
print("F1 in validation with %d neighbors: %f" % (nof, high_score))
Best number of neighbors: 19 F1 in train with 19 neighbors: 0.673043 F1 in validation with 19 neighbors: 0.663230
Models tested:
KNN1 = KNeighborsClassifier(n_neighbors=19).fit(x_train_norm[subset1],y_train_orig)
KNN2 = KNeighborsClassifier(n_neighbors=19).fit(x_train_norm_out[subset1],y_train_out)
KNN3= KNeighborsClassifier(n_neighbors=15).fit(x_train_robust_out[subset3],y_train_out)
KNN4 = KNeighborsClassifier(n_neighbors=19, metric = 'manhattan').fit(x_train_stand[subset4],y_train_orig)
KNN5 = KNeighborsClassifier(n_neighbors=19).fit(x_train_stand_out[subset3],y_train_out)
Best model:
Model_test(KNN4, x_train_stand[subset4],x_val_stand[subset4],y_train_orig,y_val_orig)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.93 0.96 0.94 6757
1 0.72 0.61 0.66 1234
accuracy 0.90 7991
macro avg 0.83 0.78 0.80 7991
weighted avg 0.90 0.90 0.90 7991
[[6472 285]
[ 486 748]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.93 0.96 0.94 1690
1 0.71 0.62 0.66 308
accuracy 0.90 1998
macro avg 0.82 0.79 0.80 1998
weighted avg 0.90 0.90 0.90 1998
[[1614 76]
[ 118 190]]
The train score is
0.6599
The val score is
0.662
KNeighborsClassifier(metric='manhattan', n_neighbors=19)
precision_recall_curve (KNN4,'points', x_train_stand[subset4],y_train_orig,x_val_stand[subset4],y_val_orig)
Best Threshold=0.315789, F-Score=0.690
parameter_space = {
'max_depth': [5,6,7,8,9,10],
'criterion': ['gini', 'entropy'],
'splitter': ['random', 'best'],
'max_features': ['sqrt', None],
'min_samples_leaf': [600,650,700,750],
'min_samples_split':[80,90,95,100,None],
'min_weight_fraction_leaf':[0.15,0.2,0.25,None],
'min_impurity_decrease':[0.015,0.02,0.025,None],
'random_state':[10]
}
DT1 = DecisionTreeClassifier(max_depth= 2,min_weight_fraction_leaf=0.05, class_weight='balanced',random_state=10).fit(x_train_stand_out[subset1], y_train_out)
DT2 = DecisionTreeClassifier(max_depth= 2,min_weight_fraction_leaf=0.05, class_weight='balanced',random_state=10).fit(x_train_stand_out[subset2], y_train_out)
DT3 = DecisionTreeClassifier(max_depth= 2,min_weight_fraction_leaf=0.03, class_weight='balanced',random_state=10).fit(x_train_stand_out[subset3], y_train_out)
DT4 = DecisionTreeClassifier(max_depth= 2,min_weight_fraction_leaf=0.02, class_weight='balanced',random_state=10).fit(x_train_stand_out[subset4], y_train_out)
DT5 = DecisionTreeClassifier(max_depth= 2,min_weight_fraction_leaf=0.02, class_weight='balanced',random_state=10).fit(x_train_stand_out[subset5], y_train_out)
DT6 = DecisionTreeClassifier(criterion='entropy',splitter='best',min_samples_split=60,min_weight_fraction_leaf=0.15,random_state=10).fit(x_train_stand_out[subset3], y_train_out)
DT7 = DecisionTreeClassifier(criterion='entropy',splitter='best',min_samples_split=5,min_weight_fraction_leaf=0.15,random_state=10).fit(x_train_robust_out[subset4],y_train_out)
dt_alpha = DecisionTreeClassifier(random_state=42)
path = dt_alpha.cost_complexity_pruning_path(x_train_stand_out, y_train_out)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
fig, ax = plt.subplots(figsize = (10,10))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='x', drawstyle="steps-post")
ax.set_xlabel("Effective alpha", fontsize=20)
ax.set_ylabel("Total Impurity of leaves", fontsize=25)
ax.set_title("Total Impurity vs Alpha for training set", fontsize=25)
Text(0.5, 1.0, 'Total Impurity vs Alpha for training set')
# The function below only accepts values higher than 0
ccp_alphas=ccp_alphas[ccp_alphas>0]
trees = []
for ccp_alpha in ccp_alphas:
dt_alpha = DecisionTreeClassifier(random_state=10, ccp_alpha=ccp_alpha).fit(x_train_stand_out,y_train_out)
trees.append(dt_alpha)
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(trees[-1].tree_.node_count, ccp_alphas[-1]))
Number of nodes in the last tree is: 1 with ccp_alpha: 0.09266368163349964
trees = trees[:-1]
ccp_alphas = ccp_alphas[:-1]
train_scores = [f1_score(y_train_out,tree.predict(x_train_stand_out)) for tree in trees]
val_scores = [f1_score(y_val_out,tree.predict(x_val_stand_out)) for tree in trees]
fig, ax = plt.subplots(figsize = (10,10))
ax.set_xlabel("Alpha", fontsize=15)
ax.set_ylabel("F1_Score", fontsize=15)
ax.set_title("F1_Score vs Alpha for training and validation sets")
ax.plot(ccp_alphas, train_scores, marker='x', label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, val_scores, marker='x', label="validation", drawstyle="steps-post")
ax.legend()
plt.show()
index_best_model = np.argmax(val_scores)
best_model = trees[index_best_model]
print('ccp_alpha of best model: ',trees[index_best_model])
print('_____________________________________________________________')
print('Training f1_score of best model: ',f1_score(y_train_out,best_model.predict(x_train_stand_out)))
print('Validation f1_score of best model: ',f1_score(y_val_out,best_model.predict(x_val_stand_out)))
ccp_alpha of best model: DecisionTreeClassifier(ccp_alpha=0.00043173619382736697, random_state=10) _____________________________________________________________ Training f1_score of best model: 0.7374301675977655 Validation f1_score of best model: 0.6726943942133815
ccp_alpha1=stratKfold(x_train_stand_out[subset1],y_train_out, DecisionTreeClassifier(ccp_alpha=0.00043173619382736697, random_state=10))
ccp_alpha2=stratKfold(x_train_stand_out[subset2],y_train_out, DecisionTreeClassifier(ccp_alpha=0.00043173619382736697, random_state=10))
ccp_alpha3=stratKfold(x_train_stand_out[subset3],y_train_out, DecisionTreeClassifier(ccp_alpha=0.00043173619382736697, random_state=10))
ccp_alpha4=stratKfold(x_train_stand_out[subset4],y_train_out, DecisionTreeClassifier(ccp_alpha=0.00043173619382736697, random_state=10))
ccp_alpha5=stratKfold(x_train_stand_out[subset5],y_train_out, DecisionTreeClassifier(ccp_alpha=0.00043173619382736697, random_state=10))
dt_alpha=DecisionTreeClassifier(ccp_alpha=0.00043173619382736697, random_state=10).fit(x_train_stand_out[subset1], y_train_out)
Best model:
Model_test(DT6,x_train_stand_out[subset3],x_val_stand_out[subset3],y_train_out,y_val_out)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.96 0.89 0.92 6729
1 0.56 0.81 0.66 1190
accuracy 0.87 7919
macro avg 0.76 0.85 0.79 7919
weighted avg 0.90 0.87 0.88 7919
[[5966 763]
[ 232 958]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.97 0.89 0.93 1682
1 0.57 0.84 0.68 298
accuracy 0.88 1980
macro avg 0.77 0.86 0.80 1980
weighted avg 0.91 0.88 0.89 1980
[[1490 192]
[ 48 250]]
The train score is
0.6582
The val score is
0.6757
DecisionTreeClassifier(criterion='entropy', min_samples_split=60,
min_weight_fraction_leaf=0.15, random_state=10)
precision_recall_curve(DT6,'dt6',x_train_stand_out[subset3],y_train_out,x_val_stand_out[subset3],y_val_out)
Best Threshold=0.556653, F-Score=0.676
Model_test(dt_alpha,x_train_stand_out[subset1],x_val_stand_out[subset1],y_train_out,y_val_out)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.94 0.97 0.95 6729
1 0.76 0.62 0.68 1190
accuracy 0.91 7919
macro avg 0.85 0.79 0.82 7919
weighted avg 0.91 0.91 0.91 7919
[[6500 229]
[ 451 739]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.93 0.96 0.94 1682
1 0.70 0.60 0.65 298
accuracy 0.90 1980
macro avg 0.82 0.78 0.80 1980
weighted avg 0.90 0.90 0.90 1980
[[1607 75]
[ 119 179]]
The train score is
0.6849
The val score is
0.6486
DecisionTreeClassifier(ccp_alpha=0.00043173619382736697, random_state=10)
precision_recall_curve(dt_alpha,'dt_ccp_alpha',x_train_stand_out[subset1], y_train_out,x_val_stand_out[subset1],y_val_out)
Best Threshold=0.213389, F-Score=0.678
Parameter search space
# model = MLPClassifier(random_state=10)
# parameter_space1 = {
# 'hidden_layer_sizes': [(10),(100),(150),(200)],
# 'tol':[ 0.00001],
# 'n_iter_no_change': [14],
# 'learning_rate':['constant','invscaling','adaptive'],
# 'activation': ['logistic',None,'tanh'],
# 'solver': ['adam','sgd'],
# 'learning_rate_init': [0.003,0.0025,0.001],
# 'batch_size': [(150),'auto'],
# 'max_iter': [(250),(150),(200)],
# 'epsilon':[0.000008,None],
# 'early_stopping':[True,False],
# 'validation_fraction':[0.2,None,0.15]
# 'beta_1':[0.95,0.85,0.99,None],
# 'beta_2':[0.93,0.85,None]
# }
# clf1 = GridSearchCV(model, parameter_space1,n_jobs=-1,scoring='f1')
Models tested:
NN1 = MLPClassifier(hidden_layer_sizes=(8),max_iter = 300, random_state = 10, learning_rate_init = 0.1, batch_size = 80, tol = 0.01).fit(x_train_stand[subset4],y_train_orig)
NN2 = MLPClassifier(random_state = 10, hidden_layer_sizes = (200)).fit(x_train_stand[subset1],y_train_orig)
NN3 = MLPClassifier(random_state = 10, hidden_layer_sizes = (150), max_iter = 250, learning_rate_init = 0.1, tol =0.001).fit(x_train_stand[subset2],y_train_orig)
NN4 = MLPClassifier(random_state = 10,batch_size = 250,learning_rate_init = 0.1,tol =0.01, n_iter_no_change = 15).fit(x_train_stand[subset3],y_train_orig)
NN5 = MLPClassifier(random_state = 10, hidden_layer_sizes = (150), tol = 0.00001, n_iter_no_change = 14).fit(x_train_robust[subset5],y_train_orig)
NN6 = MLPClassifier(random_state = 10).fit(x_train_robust[subset5],y_train_orig)
NN7 = MLPClassifier(activation= 'logistic',batch_size= 10, hidden_layer_sizes= (6,6),learning_rate_init=0.0017,max_iter=500 ,solver='adam',random_state=4,beta_1=0.99,epsilon=0.00005).fit(x_train_robust_out[subset4], y_train_out)
NN8 = MLPClassifier(activation = 'logistic',solver = 'adam',hidden_layer_sizes=(10,10,10), random_state = 4, learning_rate_init = 0.0007,max_iter=40, batch_size = 40,beta_1=0.95,beta_2=0.85,learning_rate='constant').fit(x_train_robust_out[subset4], y_train_out)
NN9 = MLPClassifier(activation = 'tanh',solver = 'adam',hidden_layer_sizes=(8), random_state = 10, learning_rate_init = 0.008, batch_size = 150).fit(x_train_robust_out[subset4],y_train_out)
NN10 = MLPClassifier(n_iter_no_change =14,batch_size=160,learning_rate_init = 0.0012,alpha=0.05,random_state = 10, hidden_layer_sizes = (160)).fit(x_train_robust_out[subset5],y_train_out)
NN11 = MLPClassifier(beta_2=0.85,learning_rate_init=0.0008,solver='adam',random_state = 10, hidden_layer_sizes = (100)).fit(x_train_robust[subset5],y_train_orig)
Best model:
Model_test(NN9, x_train_robust_out[subset4],x_val_robust_out[subset4],y_train_out,y_val_out)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.95 0.93 0.94 6729
1 0.65 0.74 0.69 1190
accuracy 0.90 7919
macro avg 0.80 0.84 0.82 7919
weighted avg 0.91 0.90 0.90 7919
[[6255 474]
[ 308 882]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.95 0.92 0.94 1682
1 0.64 0.74 0.69 298
accuracy 0.90 1980
macro avg 0.79 0.83 0.81 1980
weighted avg 0.91 0.90 0.90 1980
[[1555 127]
[ 76 222]]
The train score is
0.6929
The val score is
0.6862
MLPClassifier(activation='tanh', batch_size=150, hidden_layer_sizes=8,
learning_rate_init=0.008, random_state=10)
precision_recall_curve(NN9,'Neural Network',x_train_robust_out[subset4],y_train_out,x_val_robust_out[subset4],y_val_out)
Best Threshold=0.517165, F-Score=0.691
Models tested:
PAC1 = PassiveAggressiveClassifier(random_state=10).fit(x_train_robust_out[subset1],y_train_out)
PAC2 = PassiveAggressiveClassifier(random_state=10,loss='squared_hinge',C=0.5,n_iter_no_change=7).fit(x_train_robust_out[subset1],y_train_out)
PAC3 = PassiveAggressiveClassifier(random_state=10,loss='squared_hinge',C=2,n_iter_no_change=9).fit(x_train_robust_out[subset1],y_train_out)
Best model:
Model_test(PAC2, x_train_robust_out[subset1],x_val_robust_out[subset1],y_train_out,y_val_out)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.96 0.91 0.93 6729
1 0.60 0.77 0.67 1190
accuracy 0.89 7919
macro avg 0.78 0.84 0.80 7919
weighted avg 0.90 0.89 0.89 7919
[[6109 620]
[ 275 915]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.96 0.91 0.93 1682
1 0.60 0.79 0.68 298
accuracy 0.89 1980
macro avg 0.78 0.85 0.81 1980
weighted avg 0.91 0.89 0.89 1980
[[1524 158]
[ 63 235]]
The train score is
0.6716
The val score is
0.6802
PassiveAggressiveClassifier(C=0.5, loss='squared_hinge', n_iter_no_change=7,
random_state=10)
Used only for subset4 and subset5, since the objective was to combine it with the ensemble later on
Parameter search space:
# # For subset4
# grid = dict()
# grid['reg_param'] = np.arange(0, 1, 0.01)
# # Define search
# search = GridSearchCV(modelQDA, grid, scoring='f1', n_jobs=-1)
# # Perform the search
# results = search.fit(x_train_robust_out[subset4], y_train_out)
# # Summarize
# print('Config: %s' % results.best_params_)
# # For subset5
# # Define grid
# grid = dict()
# grid['reg_param'] = np.arange(0.5, 1, 0.01)
# # Define search
# search = GridSearchCV(modelQDA, grid, scoring='f1', n_jobs=-1)
# # Perform the search
# results = search.fit(x_train_robust[subset5], y_train_orig)
# # Summarize
# print('Config: %s' % results.best_params_)
Models tested:
QDA1 = QuadraticDiscriminantAnalysis(reg_param=0.9400000000000001).fit(x_train_robust_out[subset4],y_train_out)
QDA2 = QuadraticDiscriminantAnalysis(reg_param=0.99).fit(x_train_robust[subset5],y_train_orig)
Best model:
Model_test(QDA1,x_train_robust_out[subset4],x_val_robust_out[subset4],y_train_out,y_val_out)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.94 0.94 0.94 6729
1 0.64 0.65 0.65 1190
accuracy 0.89 7919
macro avg 0.79 0.79 0.79 7919
weighted avg 0.89 0.89 0.89 7919
[[6306 423]
[ 422 768]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.94 0.94 0.94 1682
1 0.65 0.68 0.67 298
accuracy 0.90 1980
macro avg 0.80 0.81 0.80 1980
weighted avg 0.90 0.90 0.90 1980
[[1573 109]
[ 94 204]]
The train score is
0.6451
The val score is
0.6678
QuadraticDiscriminantAnalysis(reg_param=0.9400000000000001)
precision_recall_curve(QuadraticDiscriminantAnalysis(reg_param=0.9400000000000001),'QDA',x_train_robust_out[subset4],y_train_out,x_val_robust_out[subset4],y_val_out)
Best Threshold=0.001596, F-Score=0.683
Model_test(QDA2,x_train_robust[subset5],x_val_robust[subset5],y_train_orig,y_val_orig)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.94 0.93 0.93 6757
1 0.64 0.65 0.64 1234
accuracy 0.89 7991
macro avg 0.79 0.79 0.79 7991
weighted avg 0.89 0.89 0.89 7991
[[6307 450]
[ 437 797]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.94 0.93 0.94 1690
1 0.64 0.67 0.65 308
accuracy 0.89 1998
macro avg 0.79 0.80 0.80 1998
weighted avg 0.89 0.89 0.89 1998
[[1577 113]
[ 103 205]]
The train score is
0.6425
The val score is
0.655
QuadraticDiscriminantAnalysis(reg_param=0.99)
precision_recall_curve(QuadraticDiscriminantAnalysis(reg_param=0.99),'QDA',x_train_robust[subset5],y_train_orig,x_val_robust[subset5],y_val_orig)
Best Threshold=0.000005, F-Score=0.667
Note: the method for tuning the parameters was the same as in practical classes, and not to burden the notebook it was excluded
Models tested:
SVM1 = SVC(C=0.4,kernel='rbf',random_state=10,probability=True).fit(x_train_stand[subset4],y_train_orig)
SVM2 = SVC(C=0.05,random_state=10,probability=True).fit(x_train_stand[subset4],y_train_orig)
SVM3 = SVC(C=0.4, probability=True,random_state=10).fit(x_train_stand[subset4],y_train_orig)
Best model:
Model_test(SVM1,x_train_stand[subset4],x_val_stand[subset4],y_train_orig,y_val_orig)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.93 0.95 0.94 6757
1 0.71 0.62 0.66 1234
accuracy 0.90 7991
macro avg 0.82 0.79 0.80 7991
weighted avg 0.90 0.90 0.90 7991
[[6439 318]
[ 470 764]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.93 0.95 0.94 1690
1 0.70 0.63 0.67 308
accuracy 0.90 1998
macro avg 0.82 0.79 0.80 1998
weighted avg 0.90 0.90 0.90 1998
[[1607 83]
[ 113 195]]
The train score is
0.6598
The val score is
0.6655
SVC(C=0.4, probability=True, random_state=10)
precision_recall_curve(SVM1,'SVM',x_train_stand[subset4],y_train_orig,x_val_stand[subset4],y_val_orig)
Best Threshold=0.074427, F-Score=0.704
ensemble_clfs = {
"Bagging, max_samples=0.2": BaggingClassifier( oob_score=True, max_samples=0.2, random_state=15),
"Bagging, max_samples=0.5": BaggingClassifier( oob_score=True, max_samples=0.5, random_state=15),
"Bagging, max_samples=0.7": BaggingClassifier( oob_score=True, max_samples=0.7, random_state=15),
}
min_estimators = 150
max_estimators = 210
error_rate = dict((label, []) for label in ensemble_clfs)
for label, clf in ensemble_clfs.items():
for i in range(min_estimators, max_estimators + 1):
clf.set_params(n_estimators=i)
clf.fit(x_train_stand_out[subset5] , y_train_out)
# Record the OOB error for each `n_estimators=i` setting.
oob_error = 1 - clf.oob_score_
error_rate[label].append((i, oob_error))
# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
xs, ys = zip(*clf_err)
plt.plot(xs, ys, label=label)
plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()
knn = KNeighborsClassifier()
BAG1 = BaggingClassifier(base_estimator=knn,bootstrap = False,max_samples=0.6,max_features=0.8,n_estimators = 100, random_state=15).fit(x_train_robust_out[subset1] , y_train_out)
BAG2 = BaggingClassifier(base_estimator=knn,bootstrap = False,max_samples=0.6,max_features=0.8,n_estimators = 100, random_state=15).fit(x_train_robust[subset1] , y_train_orig)
BAG3 = BaggingClassifier(base_estimator=knn,bootstrap = True,max_samples=0.3,max_features=0.85,n_estimators = 300, random_state=15).fit(x_train_robust[subset1] , y_train_orig)
BAG4 = BaggingClassifier(bootstrap = True,oob_score=True,max_samples=0.6,max_features=0.8,n_estimators = 100, random_state=15).fit(x_train_robust_out[subset5] , y_train_out)
BAG5 = BaggingClassifier(bootstrap = True,oob_score=True,bootstrap_features=True,max_samples=0.6,max_features=0.8,n_estimators = 100, random_state=15).fit(x_train_robust_out[subset5] , y_train_out)
BAG6 = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth= 5,min_impurity_decrease=0.01,min_samples_split= 70,min_weight_fraction_leaf=0.05, class_weight='balanced',random_state=42),bootstrap = True,bootstrap_features=True,n_estimators=100).fit(x_train_norm_out[subset3], y_train_out)
BAG7 = BaggingClassifier(n_estimators=193,oob_score=True,max_samples=0.2,random_state=1).fit(x_train_stand_out[subset5], y_train_out)
BAG8 = BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=95, max_depth=4, max_features = 4, random_state = 42), n_estimators=200,oob_score=0.1,random_state=1).fit(x_train_stand[subset5], y_train_orig)
BAG9 = BaggingClassifier(base_estimator=RandomForestClassifier(), n_estimators=200,oob_score=0.1,random_state=1).fit(x_train_stand[subset5], y_train_orig)
Best model:
Model_test(BAG8,x_train_stand[subset5],x_val_stand[subset5],y_train_orig,y_val_orig)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.93 0.96 0.95 6757
1 0.75 0.60 0.67 1234
accuracy 0.91 7991
macro avg 0.84 0.78 0.81 7991
weighted avg 0.90 0.91 0.90 7991
[[6504 253]
[ 491 743]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.93 0.95 0.94 1690
1 0.70 0.59 0.64 308
accuracy 0.90 1998
macro avg 0.81 0.77 0.79 1998
weighted avg 0.89 0.90 0.89 1998
[[1612 78]
[ 127 181]]
The train score is
0.6664
The val score is
0.6384
BaggingClassifier(base_estimator=RandomForestClassifier(max_depth=4,
max_features=4,
n_estimators=95,
random_state=42),
n_estimators=200, oob_score=0.1, random_state=1)
precision_recall_curve(BAG8,'BG',x_train_stand[subset5],y_train_orig,x_val_stand[subset5],y_val_orig)
Best Threshold=0.415383, F-Score=0.703
ensemble_clfs = [
("RandomForestClassifier, max_features=4",
RandomForestClassifier(oob_score=True,
max_features=4,
random_state=10)),
("RandomForestClassifier, max_features='log2'",
RandomForestClassifier(max_features='log2',
oob_score=True,
random_state=10)),
("RandomForestClassifier, max_features='sqrt'",
RandomForestClassifier(max_features='sqrt',
oob_score=True,
random_state=10)),
("RandomForestClassifier, max_features=None",
RandomForestClassifier(max_features=None,
oob_score=True,
random_state=10))
]
# Best combinations for DT and RFC, testing for different parameters
RF1 = RandomForestClassifier(min_samples_split=300,min_weight_fraction_leaf=0.01,random_state=10).fit(x_train_stand[subset3], y_train_orig)
RF2 = RandomForestClassifier(ccp_alpha=0.00032, random_state=10).fit(x_train_stand_out[subset5],y_train_out)
RF3 = RandomForestClassifier(ccp_alpha=0.0020109, random_state=10).fit(x_train_stand,y_train_orig)
RF4 = RandomForestClassifier(ccp_alpha=0.00032, random_state=10).fit(x_train_stand[subset5],y_train_orig)
RF5 = RandomForestClassifier(max_depth = 4, min_weight_fraction_leaf = 0.15, random_state=10).fit(x_train_stand[subset5], y_train_orig)
RF6 = RandomForestClassifier(n_estimators=100, max_depth=4, random_state = 10).fit(x_train_stand[subset5], y_train_orig)
RF7 = RandomForestClassifier(n_estimators=98, max_depth=4, max_features = 4, random_state = 10).fit(x_train_stand[subset5], y_train_orig)
RF8 = RandomForestClassifier(n_estimators=100, max_depth=4, random_state = 10).fit(x_train_stand[subset3], y_train_orig)
RF9 = RandomForestClassifier(n_estimators=95, max_depth=4, max_features = 4, random_state = 10).fit(x_train_stand[subset3], y_train_orig)
RF10 = RandomForestClassifier(n_estimators=199, max_depth=6, max_features = 'sqrt', random_state = 10).fit(x_train_stand[subset5], y_train_orig)
RF11 = RandomForestClassifier(n_estimators=199, max_depth=4, max_features = 4, random_state = 10).fit(x_train_stand[subset4], y_train_orig)
RF12 = RandomForestClassifier(n_estimators=199, max_depth=4, max_features = 4, random_state = 10).fit(x_train_stand[subset3], y_train_orig)
RF13 = RandomForestClassifier(n_estimators=199, max_depth=4, max_features = 4, random_state = 10).fit(x_train_stand[subset2], y_train_orig)
RF14 = RandomForestClassifier(n_estimators=199, max_depth=4, max_features = 4, random_state = 10).fit(x_train_stand[subset1], y_train_orig)
RF15 = RandomForestClassifier(max_features=6, min_samples_split=100, n_estimators=10,oob_score=True, random_state=0).fit(x_train_stand_out[subset4],y_train_out)
RF16 = RandomForestClassifier(max_features=0.8, min_samples_split=25,min_samples_leaf=25, n_estimators=200,max_depth=4,oob_score=True,random_state=0).fit(x_train_stand_out[subset4],y_train_out)
Best model:
Model_test(RF15,x_train_stand_out[subset4],x_val_stand_out[subset4],y_train_out,y_val_out)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.94 0.96 0.95 6729
1 0.75 0.65 0.70 1190
accuracy 0.91 7919
macro avg 0.84 0.81 0.82 7919
weighted avg 0.91 0.91 0.91 7919
[[6466 263]
[ 411 779]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.94 0.96 0.95 1682
1 0.72 0.64 0.68 298
accuracy 0.91 1980
macro avg 0.83 0.80 0.81 1980
weighted avg 0.90 0.91 0.91 1980
[[1608 74]
[ 107 191]]
The train score is
0.698
The val score is
0.6785
RandomForestClassifier(max_features=6, min_samples_split=100, n_estimators=10,
oob_score=True, random_state=0)
precision_recall_curve(RF15,'RF',x_train_stand_out[subset4],y_train_out,x_val_stand_out[subset4],y_val_out)
Best Threshold=0.453868, F-Score=0.693
The optimal parameters were found using the same method as in practical classes, but excluded here to improve readability
Models tested:
ABC1 = AdaBoostClassifier(n_estimators=11,learning_rate=0.8, random_state=10).fit(x_train_norm[subset4],y_train_orig)
ABC2 = AdaBoostClassifier(n_estimators=14,learning_rate=0.7, random_state=10).fit(x_train_robust_out[subset4],y_train_out)
ABC3 = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth=2),n_estimators=26,learning_rate = 0.05, random_state = 10).fit(x_train_robust[subset5],y_train_orig)
ABC4 = AdaBoostClassifier(n_estimators=12,learning_rate=0.05, random_state=10).fit(x_train_robust_out[subset4],y_train_out)
Best model:
Model_test(ABC4,x_train_robust_out[subset4],x_val_robust_out[subset4],y_train_out,y_val_out)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.96 0.91 0.93 6729
1 0.59 0.76 0.66 1190
accuracy 0.88 7919
macro avg 0.77 0.83 0.80 7919
weighted avg 0.90 0.88 0.89 7919
[[6095 634]
[ 285 905]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.96 0.90 0.93 1682
1 0.59 0.79 0.68 298
accuracy 0.89 1980
macro avg 0.78 0.84 0.80 1980
weighted avg 0.90 0.89 0.89 1980
[[1521 161]
[ 64 234]]
The train score is
0.6632
The val score is
0.6753
AdaBoostClassifier(learning_rate=0.05, n_estimators=12, random_state=10)
precision_recall_curve(ABC4,'ABC',x_train_robust_out[subset4],y_train_out,x_val_robust_out[subset4],y_val_out)
Best Threshold=0.335246, F-Score=0.679
Models tested:
rf = RandomForestClassifier(random_state=10)
GB1 = GradientBoostingClassifier(n_estimators=20, max_depth=6, max_features=5,min_samples_split=250,min_samples_leaf=200,learning_rate=0.05,min_impurity_decrease=0.01,validation_fraction=0.3,init=rf).fit(x_train_stand[subset4], y_train_orig)
GB2 = GradientBoostingClassifier(learning_rate=1.1500000000000004, loss='exponential',max_depth=2, max_features='sqrt', n_estimators=80,criterion='friedman_mse').fit(x_train_stand[subset4], y_train_orig)
GB3 = GradientBoostingClassifier(learning_rate=0.15000000000000002, loss='exponential',max_depth=2, max_features='sqrt', n_estimators=115).fit(x_train_stand[subset4], y_train_orig)
GB4 = GradientBoostingClassifier(learning_rate=0.25000000000000006, loss='exponential',max_depth=3, max_features='log2', n_estimators=25).fit(x_train_stand[subset3], y_train_orig).fit(x_train_stand[subset3], y_train_orig)
GB5 = GradientBoostingClassifier(learning_rate= 1.1500000000000004, loss='exponential', max_depth=2, max_features='sqrt',n_estimators=80).fit(x_train_stand[subset1], y_train_orig)
GB6 = GradientBoostingClassifier(learning_rate= 0.8, loss='exponential', max_depth=3, max_features='sqrt',n_estimators=50).fit(x_train_stand[subset2], y_train_orig)
GB7 = GradientBoostingClassifier(learning_rate= 1.0000000000000002, loss='exponential', max_depth=2, max_features='sqrt',n_estimators=100, random_state=5).fit(x_train_stand[subset1], y_train_orig)
GB8 = GradientBoostingClassifier(learning_rate= 1.00, loss='exponential', max_depth=2, max_features='sqrt',n_estimators=30, random_state=5).fit(x_train_stand[subset2], y_train_orig)
GB9 = GradientBoostingClassifier(learning_rate= 1.25, loss='exponential', max_depth=2, max_features='sqrt',n_estimators=30, random_state=5).fit(x_train_stand[subset1], y_train_orig)
Best model:
Model_test(GB9,x_train_stand[subset1],x_val_stand[subset1],y_train_orig,y_val_orig)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.93 0.96 0.95 6757
1 0.75 0.59 0.66 1234
accuracy 0.91 7991
macro avg 0.84 0.78 0.80 7991
weighted avg 0.90 0.91 0.90 7991
[[6516 241]
[ 508 726]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.93 0.96 0.94 1690
1 0.73 0.58 0.65 308
accuracy 0.90 1998
macro avg 0.83 0.77 0.80 1998
weighted avg 0.90 0.90 0.90 1998
[[1625 65]
[ 128 180]]
The train score is
0.6597
The val score is
0.651
GradientBoostingClassifier(learning_rate=1.25, loss='exponential', max_depth=2,
max_features='sqrt', n_estimators=30,
random_state=5)
precision_recall_curve(GB9,'GB',x_train_stand[subset1],y_train_orig,x_val_stand[subset1],y_val_orig)
Best Threshold=0.335239, F-Score=0.694
For Subset4 with robust scalling without outliers
estimators_4 = [
('NN9',MLPClassifier(activation = 'tanh',solver = 'adam',hidden_layer_sizes=(8), random_state = 10, learning_rate_init = 0.008, batch_size = 150)),
('QDA1', QuadraticDiscriminantAnalysis(reg_param=0.9400000000000001)),
#('KNN3',KNeighborsClassifier(n_neighbors=15))]
('DT7', DecisionTreeClassifier(criterion='entropy',splitter='best',min_samples_split=5,min_weight_fraction_leaf=0.15,random_state=42))]
# Tested all possible combinations
# all 4
# 0.6929824561403508
# 0.6876971608832808
# nn+qda+knn
# 0.6928158546655657
# 0.6819672131147541
# nn+qda+dt - best one
# 0.6892523364485981
# 0.6871165644171779
# nn+knn+dt
# 0.6955140928940056
# 0.6834645669291338
# nn+qda
# 0.6892700438771441
# 0.6854460093896715
# nn+knn
# 0.6911643270024773
# 0.6819672131147541
# nn+dt
# 0.6895208414491624
# 0.6860643185298623
SC1 = StackingClassifier(estimators = estimators_4,final_estimator=LogisticRegression()).fit(x_train_robust_out[subset4],y_train_out)
Model_test(SC1,x_train_robust_out[subset4],x_val_robust_out[subset4],y_train_out,y_val_out)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.95 0.93 0.94 6729
1 0.64 0.74 0.69 1190
accuracy 0.90 7919
macro avg 0.80 0.84 0.81 7919
weighted avg 0.91 0.90 0.90 7919
[[6236 493]
[ 305 885]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.95 0.92 0.94 1682
1 0.63 0.75 0.69 298
accuracy 0.90 1980
macro avg 0.79 0.84 0.81 1980
weighted avg 0.91 0.90 0.90 1980
[[1552 130]
[ 74 224]]
The train score is
0.6893
The val score is
0.6871
StackingClassifier(estimators=[('NN9',
MLPClassifier(activation='tanh', batch_size=150,
hidden_layer_sizes=8,
learning_rate_init=0.008,
random_state=10)),
('QDA1',
QuadraticDiscriminantAnalysis(reg_param=0.9400000000000001)),
('DT7',
DecisionTreeClassifier(criterion='entropy',
min_samples_split=5,
min_weight_fraction_leaf=0.15,
random_state=42))],
final_estimator=LogisticRegression())
precision_recall_curve(SC1,'SC1',x_train_robust_out[subset4],y_train_out,x_val_robust_out[subset4],y_val_out)
Best Threshold=0.313169, F-Score=0.696
For Subset5 with robust scalling with outliers
estimators_5 = [
('NN5', MLPClassifier(random_state = 10, hidden_layer_sizes = (150), tol = 0.00001, n_iter_no_change = 14)),
('RF_BAG8', RandomForestClassifier(n_estimators=95, max_depth=4, max_features = 4, random_state = 42)),
('QDA2',QuadraticDiscriminantAnalysis(reg_param=0.99))]
# Tested all possible combinations
# all 3 - the best
# 0.6967854039965248
# 0.6824324324324325
# nn + rf
# 0.6950108459869849
# 0.6791171477079797
# nn + qd2
# 0.6950959488272922
# 0.6767169179229482
# rf + qd2
# 0.663339382940109
# 0.6238859180035651
SC2 = StackingClassifier(estimators = estimators_5,final_estimator=LogisticRegression()).fit(x_train_robust[subset5],y_train_orig)
Model_test(SC2,x_train_robust[subset5],x_val_robust[subset5],y_train_orig,y_val_orig)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.94 0.96 0.95 6757
1 0.75 0.65 0.70 1234
accuracy 0.91 7991
macro avg 0.84 0.81 0.82 7991
weighted avg 0.91 0.91 0.91 7991
[[6491 266]
[ 432 802]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.94 0.95 0.94 1690
1 0.71 0.66 0.68 308
accuracy 0.91 1998
macro avg 0.82 0.80 0.81 1998
weighted avg 0.90 0.91 0.90 1998
[[1608 82]
[ 106 202]]
The train score is
0.6968
The val score is
0.6824
StackingClassifier(estimators=[('NN5',
MLPClassifier(hidden_layer_sizes=150,
n_iter_no_change=14,
random_state=10, tol=1e-05)),
('RF_BAG8',
RandomForestClassifier(max_depth=4,
max_features=4,
n_estimators=95,
random_state=42)),
('QDA2',
QuadraticDiscriminantAnalysis(reg_param=0.99))],
final_estimator=LogisticRegression())
precision_recall_curve(SC2,'stacking',x_train_robust[subset5],y_train_orig,x_val_robust[subset5],y_val_orig)
Best Threshold=0.296030, F-Score=0.707
For Subset4 with standard scalling with outliers
estimators_4orig = [
#('BAG8',BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=95,max_depth=4,max_features = 4), n_estimators=200,oob_score=0.1,random_state=10)),
('SVM3',SVC(C=0.4, probability=True,random_state=10)),
#('RF15',RandomForestClassifier(max_features=6, min_samples_split=100, n_estimators=10,oob_score=True, random_state=0)),
('KNN4',KNeighborsClassifier(n_neighbors=19, metric = 'manhattan')),
('NN1',MLPClassifier(hidden_layer_sizes=(8),max_iter = 300, random_state= 10, learning_rate_init = 0.1, batch_size = 80, tol = 0.01))]
# Tested all possible combinations
# All 5
# 0.6676
# 0.656
# svm,rf,knn,nn
# 0.6764
# 0.6596
# bg,rf,knn,nn
# 0.6746
# 0.6445
# rf,knn,nn
# 0.6766
# 0.6537
# svm,knn,nn - best one
# 0.6693
# 0.6689
# svm,rf,nn
# 0.6823
# 0.6619
# svm,rf,knn
# 0.6685
# 0.6521
SC3 = StackingClassifier(estimators = estimators_4orig,final_estimator=LogisticRegression()).fit(x_train_stand[subset4],y_train_orig)
Model_test(SC3,x_train_stand[subset4],x_val_stand[subset4],y_train_orig,y_val_orig)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.93 0.95 0.94 6757
1 0.72 0.63 0.67 1234
accuracy 0.90 7991
macro avg 0.82 0.79 0.81 7991
weighted avg 0.90 0.90 0.90 7991
[[6448 309]
[ 458 776]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.94 0.95 0.94 1690
1 0.70 0.64 0.67 308
accuracy 0.90 1998
macro avg 0.82 0.79 0.81 1998
weighted avg 0.90 0.90 0.90 1998
[[1606 84]
[ 111 197]]
The train score is
0.6693
The val score is
0.6689
StackingClassifier(estimators=[('SVM3',
SVC(C=0.4, probability=True, random_state=10)),
('KNN4',
KNeighborsClassifier(metric='manhattan',
n_neighbors=19)),
('NN1',
MLPClassifier(batch_size=80,
hidden_layer_sizes=8,
learning_rate_init=0.1,
max_iter=300, random_state=10,
tol=0.01))],
final_estimator=LogisticRegression())
precision_recall_curve(SC3,'stacking',x_train_stand[subset4],y_train_orig,x_val_stand[subset4],y_val_orig)
Best Threshold=0.312008, F-Score=0.706
For Subset4 with standard scalling with outliers
estimators_4orig = [
('BAG8',BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=95,max_depth=4,max_features = 4), n_estimators=200,oob_score=0.1,random_state=10)),
('SVM3',SVC(C=0.4, probability=True,random_state=10)),
#('RF15',RandomForestClassifier(max_features=6, min_samples_split=100, n_estimators=10,oob_score=True, random_state=0)),
('KNN4',KNeighborsClassifier(n_neighbors=19, metric = 'manhattan')),
('NN1',MLPClassifier(hidden_layer_sizes=(8),max_iter = 300, random_state= 10, learning_rate_init = 0.1, batch_size = 80, tol = 0.01))]
# Tested all possible combinations
# All 5
# 0.6728
# 0.6598
# bg,svm,knn,nn --
# 0.6629
# 0.6667
# svm,rf,knn,nn
# 0.6655
# 0.6644
# bg,rf,knn,nn
# 0.6798
# 0.6667
# bg,svm,rf,nn
# 0.6693
# 0.6491
# bg,svm,rf,knn
# 0.6631
# 0.6643
# svm,rf,knn
# 0.6608
# 0.6573
# rf,knn,nn --
# 0.6834
# 0.6745
# svm,knn,nn
# 0.6574
# 0.6574
# svm,rf,nn
# 0.6664
# 0.6503
V1 = VotingClassifier(estimators = estimators_4orig,n_jobs=-1,voting='soft').fit(x_train_stand[subset4],y_train_orig)
Model_test(V1,x_train_stand[subset4],x_val_stand[subset4],y_train_orig,y_val_orig)
___________________________________________________________________________________________________________
TRAIN
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.93 0.96 0.94 6757
1 0.72 0.62 0.66 1234
accuracy 0.90 7991
macro avg 0.82 0.79 0.80 7991
weighted avg 0.90 0.90 0.90 7991
[[6454 303]
[ 472 762]]
___________________________________________________________________________________________________________
VALIDATION
-----------------------------------------------------------------------------------------------------------
precision recall f1-score support
0 0.93 0.95 0.94 1690
1 0.71 0.63 0.67 308
accuracy 0.90 1998
macro avg 0.82 0.79 0.80 1998
weighted avg 0.90 0.90 0.90 1998
[[1610 80]
[ 114 194]]
The train score is
0.6629
The val score is
0.6667
VotingClassifier(estimators=[('BAG8',
BaggingClassifier(base_estimator=RandomForestClassifier(max_depth=4,
max_features=4,
n_estimators=95),
n_estimators=200, oob_score=0.1,
random_state=10)),
('SVM3',
SVC(C=0.4, probability=True, random_state=10)),
('KNN4',
KNeighborsClassifier(metric='manhattan',
n_neighbors=19)),
('NN1',
MLPClassifier(batch_size=80, hidden_layer_sizes=8,
learning_rate_init=0.1,
max_iter=300, random_state=10,
tol=0.01))],
n_jobs=-1, voting='soft')
precision_recall_curve(V1,'V1',x_train_stand[subset4],y_train_orig,x_val_stand[subset4],y_val_orig)
Best Threshold=0.333664, F-Score=0.707
Adjusting the weights given to each classifier
v1s_4w1 = VotingClassifier(estimators = estimators_4orig,n_jobs=-1,voting='soft',weights=[20,20,20,40]).fit(x_train_stand[subset4],y_train_orig)
print('W1: Train:',round(f1_score(y_train_orig,v1s_4w1.predict(x_train_stand[subset4])),4),'Validation:',round(f1_score(y_val_orig,v1s_4w1.predict(x_val_stand[subset4])),4))
v1s_4w2 = VotingClassifier(estimators = estimators_4orig,n_jobs=-1,voting='soft',weights=[10,25,25,40]).fit(x_train_stand[subset4],y_train_orig)
print('W2: Train:',round(f1_score(y_train_orig,v1s_4w2.predict(x_train_stand[subset4])),4),'Validation:',round(f1_score(y_val_orig,v1s_4w2.predict(x_val_stand[subset4])),4))
v1s_4w3 = VotingClassifier(estimators = estimators_4orig,n_jobs=-1,voting='soft',weights=[10,20,20,50]).fit(x_train_stand[subset4],y_train_orig)
print('W3: Train:',round(f1_score(y_train_orig,v1s_4w3.predict(x_train_stand[subset4])),4),'Validation:',round(f1_score(y_val_orig,v1s_4w3.predict(x_val_stand[subset4])),4))
v1s_4w4 = VotingClassifier(estimators = estimators_4orig,n_jobs=-1,voting='soft',weights=[10,30,30,30]).fit(x_train_stand[subset4],y_train_orig)
print('W4: Train:',round(f1_score(y_train_orig,v1s_4w4.predict(x_train_stand[subset4])),4),'Validation:',round(f1_score(y_val_orig,v1s_4w4.predict(x_val_stand[subset4])),4))
v1s_4w5 = VotingClassifier(estimators = estimators_4orig,n_jobs=-1,voting='soft',weights=[10,10,10,70]).fit(x_train_stand[subset4],y_train_orig)
print('W5: Train:',round(f1_score(y_train_orig,v1s_4w5.predict(x_train_stand[subset4])),4),'Validation:',round(f1_score(y_val_orig,v1s_4w5.predict(x_val_stand[subset4])),4))
v1s_4w6 = VotingClassifier(estimators = estimators_4orig,n_jobs=-1,voting='soft',weights=[10,10,20,60]).fit(x_train_stand[subset4],y_train_orig)
print('W6: Train:',round(f1_score(y_train_orig,v1s_4w6.predict(x_train_stand[subset4])),4),'Validation:',round(f1_score(y_val_orig,v1s_4w6.predict(x_val_stand[subset4])),4))
v1s_4w7 = VotingClassifier(estimators = estimators_4orig,n_jobs=-1,voting='soft',weights=[10,20,10,60]).fit(x_train_stand[subset4],y_train_orig)
print('W7: Train:',round(f1_score(y_train_orig,v1s_4w7.predict(x_train_stand[subset4])),4),'Validation:',round(f1_score(y_val_orig,v1s_4w7.predict(x_val_stand[subset4])),4))
W1: Train: 0.6649 Validation: 0.6633 W2: Train: 0.6606 Validation: 0.6644 W3: Train: 0.6661 Validation: 0.6621 W4: Train: 0.6585 Validation: 0.6644 W5: Train: 0.682 Validation: 0.686 W6: Train: 0.6789 Validation: 0.682 W7: Train: 0.6681 Validation: 0.6599
precision_recall_curve(v1s_4w1,'voting',x_train_stand[subset4],y_train_orig,x_val_stand[subset4],y_val_orig)
Best Threshold=0.369622, F-Score=0.711
precision_recall_curve(v1s_4w5,'voting',x_train_stand[subset4],y_train_orig,x_val_stand[subset4],y_val_orig)
Best Threshold=0.444191, F-Score=0.704
For Subset5 with robust scalling with outliers
estimators_5 = [
('NN5', MLPClassifier(random_state = 10, hidden_layer_sizes = (150), tol = 0.00001, n_iter_no_change = 14)),
('RF_BAG8', RandomForestClassifier(n_estimators=95, max_depth=4, max_features = 4, random_state = 42))]
#('QDA2',QuadraticDiscriminantAnalysis(reg_param=0.99))]
# Tested all possible combinations
# c/3
# 0.654
# 0.6624
# nn+rf ---
# 0.702
# 0.6945
# nn+qda
# 0.6441
# 0.6539
# rf+qda
# 0.6427
# 0.6539
V2 = VotingClassifier(estimators = estimators_5,n_jobs=-1,voting='soft').fit(x_train_robust[subset5],y_train_orig)
precision_recall_curve(V2,'voting',x_train_robust[subset5],y_train_orig,x_val_robust[subset5],y_val_orig)
Best Threshold=0.380227, F-Score=0.703
test = pd.read_csv('test.csv')
def scalling (x_train,x_val,test,standModel):
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
x = pd.DataFrame(x_train.select_dtypes(include=np.number))
cat_features = x_train.select_dtypes(exclude=np.number).columns
num_features = x_train.select_dtypes(include=np.number).columns
if standModel == 'standardize':
scaler = StandardScaler().fit(x)
if standModel == 'normalize':
scaler = MinMaxScaler(feature_range=(-1,1)).fit(x)
if standModel == 'robust':
scaler = RobustScaler().fit(x)
x_train_scaled = scaler.transform(x) # this will return an array
# Convert the array to a pandas dataframe
x_train_scaled = pd.DataFrame(x_train_scaled, columns = num_features,index = x_train.index)
x_train = pd.concat([x_train_scaled,x_train[cat_features]],axis=1)
x_val_scaled = pd.DataFrame(scaler.transform(x_val[num_features]), index = x_val.index,columns=num_features)
x_val = pd.concat([x_val_scaled,x_val[cat_features]],axis=1)
x_test_scaled = pd.DataFrame(scaler.transform(test[num_features]),index=test.index,columns=num_features)
test = pd.concat([x_test_scaled,test[cat_features]],axis=1)
scaled_data = [x_train,x_val,test]
return(scaled_data)
def pred_withThreshold (model,data,threshold):
y_pred = ((model.predict_proba(data)[:,1]) >threshold).astype(bool)
return(y_pred)
#!pip install imblearn
# New variables for this stage
train['Total_value'] = (train['GoogleAnalytics_PageValue'] * train['Product_Pages']).fillna(value=0)
train['Value_per_second'] =( train['Total_value'] / (train['Product_Duration'])).fillna(value=0)
Tested with and without the use of SMOTE. Here presented is only the best one: With SMOTE
train_no_value = train_out.loc[train_out['GoogleAnalytics_BounceRate']==0]
train_no_value['Total_value'] = (train_no_value['GoogleAnalytics_PageValue'] * train_no_value['Product_Pages']).fillna(value=0)
train_no_value['Value_per_second'] =(train_no_value['Total_value'] / (train_no_value['Product_Duration'])).fillna(value=0)
x_noVal_train, y_noVal_train, x_noVal_val, y_noVal_val = split(train_no_value.drop('Buy',axis=1), train_no_value['Buy'])
teste_no_value = ft_engineer(test)
teste_no_value['Total_value'] = (teste_no_value['GoogleAnalytics_PageValue'] * teste_no_value['Product_Pages']).fillna(value=0)
teste_no_value['Value_per_second'] =(teste_no_value['Total_value'] / (teste_no_value['Product_Duration'])).fillna(value=0)
x_noVal_train_stand, x_noVal_val_stand, test_noVal_stand = scalling(x_noVal_train, x_noVal_val, teste_no_value,'standardize')
set1 = RFE(x_noVal_train_stand,x_noVal_val_stand,y_noVal_train,y_noVal_val,10,RandomForestClassifier(max_depth=3,random_state=0))
set1.index
new_subset1 = ['AccountMng_Pages', 'GoogleAnalytics_ExitRate', 'Value_per_second','Total_value','New_Access','x2_11','Product_Duration_Mins']
The validation score is 0.9210822998872604 The train score is 0.923337091319053
from imblearn.over_sampling import BorderlineSMOTE
borderline2 = BorderlineSMOTE(sampling_strategy=0.25, random_state=0, k_neighbors=2,kind='borderline-2')
borderline2_df1 = borderline2.fit_resample(x_noVal_train_stand,y_noVal_train)
x1_borderline2 = borderline2_df1[0]
y1_borderline2 = borderline2_df1[1]
rf_borderline =RandomForestClassifier(n_estimators=150,max_features=0.6,max_depth=3,random_state=0)
rf_border_fit = fit_assess(rf_borderline,x1_borderline2[new_subset1], y1_borderline2,x_noVal_val_stand[new_subset1],y_noVal_val)
print(f'The f1 score of the validation set is: {f1_score(y_noVal_val, rf_border_fit.predict(x_noVal_val_stand[new_subset1]))}')
Avg F1 Score (Train): 0.8011000000000001 Avg F1 Score (Validation): 0.791 Std F1 Score (Train): 0.001100504934614613 Std F1 Score (Validation): 0.01136270703270622 The R2 of the model is 0.9131905298759865 f1_train recall_train precision_train f1_test recall_test \ 0 0.802 0.829 0.777 0.809 0.792 1 0.803 0.823 0.783 0.770 0.757 2 0.799 0.817 0.781 0.797 0.806 3 0.800 0.821 0.781 0.787 0.799 4 0.802 0.821 0.784 0.777 0.795 5 0.801 0.820 0.783 0.788 0.811 6 0.801 0.820 0.782 0.792 0.810 7 0.801 0.821 0.783 0.794 0.813 8 0.801 0.821 0.782 0.802 0.820 9 0.801 0.821 0.783 0.794 0.813 precision_test 0 0.826 1 0.784 2 0.790 3 0.777 4 0.761 5 0.768 6 0.776 7 0.778 8 0.786 9 0.778 The f1 score of the validation set is: 0.7741935483870969
prediction_Novalue = pd.DataFrame(pred_withThreshold(rf_border_fit,test_noVal_stand[new_subset1], 0.45), index = test_noVal_stand.index,columns=['Buy'])
#prediction_Novalue.mean()
Tested with and without the use of SMOTE. Here presented is only the best one: With SMOTE
train_value = train_out.loc[train_out['GoogleAnalytics_BounceRate']>0]
train_value['Total_value'] = (train_value['GoogleAnalytics_PageValue'] * train_value['Product_Pages']).fillna(value=0)
train_value['Value_per_second'] =(train_value['Total_value'] / (train_value['Product_Duration'])).fillna(value=0)
x_Val_train, y_Val_train, x_Val_val, y_Val_val = split(train_value.drop('Buy',axis=1), train_value['Buy'])
teste_value = ft_engineer(test)
teste_value['Total_value'] = (teste_value['GoogleAnalytics_PageValue'] * teste_value['Product_Pages']).fillna(value=0)
teste_value['Value_per_second'] =(teste_value['Total_value'] / (teste_value['Product_Duration'])).fillna(value=0)
teste_value = teste_value.loc[teste_value['GoogleAnalytics_BounceRate']>0]
x_Val_train_stand, x_Val_val_stand, test_Val_stand = scalling(x_Val_train, x_Val_val, teste_value,'standardize')
set2 = RFE(x_Val_train_stand,x_Val_val_stand,y_Val_train,y_Val_val,15,RandomForestClassifier(max_depth=3,random_state=0))
new_subset2 =['AccountMng_Pages','AccountMng_Duration_Mins','GoogleAnalytics_BounceRate','Value_per_second','Total_value','Product_Pages','diff_Exit_Bounce','x2_11','Value_noFAQPage','New_Access']
The validation score is 0.8828911253430924 The train score is 0.8890414092884923
borderline2 = BorderlineSMOTE(sampling_strategy=0.3, random_state=0, k_neighbors=2,kind='borderline-2')
borderline2_df = borderline2.fit_resample(x_Val_train_stand,y_Val_train)
x_borderline2 = borderline2_df[0]
y_borderline2 = borderline2_df[1]
model_ =RandomForestClassifier(n_estimators=150,max_depth=4,max_features=0.8,random_state=0)
model_fit = fit_assess(model_,x_borderline2[new_subset2], y_borderline2,x_Val_val_stand[new_subset2], y_Val_val)
print(f'The f1 score of the validation set is: {f1_score(y_Val_val, model_fit.predict(x_Val_val_stand[new_subset2]))}')
Avg F1 Score (Train): 0.7195999999999999 Avg F1 Score (Validation): 0.6973999999999999 Std F1 Score (Train): 0.0030258148581093942 Std F1 Score (Validation): 0.013023056476879744 The R2 of the model is 0.8810612991765783 f1_train recall_train precision_train f1_test recall_test \ 0 0.725 0.707 0.743 0.661 0.635 1 0.724 0.713 0.735 0.700 0.670 2 0.719 0.702 0.738 0.702 0.675 3 0.721 0.704 0.739 0.698 0.663 4 0.721 0.705 0.737 0.699 0.670 5 0.717 0.698 0.738 0.705 0.668 6 0.717 0.698 0.738 0.705 0.673 7 0.718 0.697 0.740 0.699 0.666 8 0.717 0.694 0.741 0.702 0.675 9 0.717 0.695 0.742 0.703 0.679 precision_test 0 0.689 1 0.734 2 0.731 3 0.738 4 0.732 5 0.747 6 0.743 7 0.738 8 0.733 9 0.732 The f1 score of the validation set is: 0.5517241379310345
prediction_value = pd.DataFrame(pred_withThreshold(model_fit,test_Val_stand[new_subset2], 0.45), index = test_Val_stand.index,columns=['Buy'])
#prediction_value.mean()
full_prediction = pd.concat([prediction_Novalue, prediction_value],axis=0)
full_prediction.mean()
Buy 0.182762 dtype: float64